Performance issue when searching data in a large array


(Dexin Li) #1

Hello,

We have an ES index of 14.2GB with 880,000+ docs.
Each doc has a structure like this:

{
    "_index": "xxx",
    "_type": "playlist",
    "_id": "xxx",
    "_score": 0,
    "_source": {
        "id": 6603695721583499000,
        "track": [
            "tenfeettalldavidguettaremix",
            "fivemorehoursdeorroxchrisbrown",
            "iloveitwhenyoucrymoxokiradioedit",
            "thinkingaboutyou",
            "desiregryffinremix",
            "whereareünowwithjustinbieber",
            "whereareunowwithjustinbieber",
            "oceandrivemichaelcalfanremix",
            "dareyou2move",
            "heynowfeatkylesinglemix",
            "purplelightfewwolvesbastiaanremix",
            "fantasyfelixjaehnremix",
            "geckooverdriveradioedit",
            "sugarfeatfrancescoyates",
            "spaceman",
            "yellow",
            "fastcar",
            "thistime",
            "oftenkygoremix",
            "readyforyourlove",
            "anotheryou",
            "latch",
            "thisgirlkungsvscookinon3burnerskungsvscookinon3burners",
            "faded",
            "calabriafirebeatzremix",
            "sweatsnoopdoggvsdavidguettaremix",
            "cantgohomeradioedit",
            "thefeeling",
            "liarliar",
            "thisiswhatyoucamefor",
            "nothingreallymattersafrojackremix"
        ]
    }
}

The track field usually has 100-1000 strings.
When querying a term in the track field like:

{
  "query": {
    "bool": {
      "filter": {
        "term": {
          "track": "yellow"
        }
      }
    }
  }
}

it takes 100ms each time, which is too slow for us.
And here is our index information:

{
    "state": "open",
    "settings": {
        "index": {
            "number_of_shards": "5",
            "provided_name": "xxxx",
            "creation_date": "xxx",
            "unassigned": {
                "node_left": {
                    "delayed_timeout": "30m"
                }
            },
            "analysis": {
                "filter": {
                    "autocomplete_filter": {
                        "type": "edge_ngram",
                        "min_gram": "1",
                        "max_gram": "20"
                    },
                    "trigrams_filter": {
                        "type": "ngram",
                        "min_gram": "3",
                        "max_gram": "3"
                    }
                },
                "analyzer": {
                    "trigrams_analyzer": {
                        "filter": [
                            "lowercase",
                            "trigrams_filter"
                        ],
                        "type": "custom",
                        "tokenizer": "standard"
                    }
                }
            },
            "number_of_replicas": "0",
            "version": {
                "created": "5050399"
            }
        }
    },
    "mappings": {
        "playlist": {
            "include_in_all": false,
            "properties": {
                "track": {
                    "type": "keyword",
                    "fields": {
                        "autocomplete": {
                            "search_analyzer": "standard",
                            "analyzer": "autocomplete_analyzer",
                            "type": "text"
                        },
                        "trigrams": {
                            "search_analyzer": "standard",
                            "analyzer": "trigrams_analyzer",
                            "type": "text"
                        }
                    }
                }
            }
        }
    }
}

Thanks


(Dexin Li) #2

After using source filtering, we have halved the latency.