ElasticSearch - Phonetic and Human Names


(Nathan Smith) #1

I have a large database of names, primarily from Scotland. We're currently
producing a prototype to replace an existing piece of software which
carries out the search. This is still in production and we're aiming to get
our results as close as possible to the current results of the same search.

I was hoping someone could help me out. When I enter the query
"Michael Heaney" into Elastic Search, I get some wild results. The
current search returns two main surnames, "Heaney" and "Heavey", all
with the forename "Michael". I can get the "Heaney" results in
Elastic Search; however, I can't obtain "Heavey", and ES also returns people
without the forename "Michael" (though I appreciate that that's due to it
being part of the fuzzy query). I know this is a narrow use case, as it's
only one search, but getting this result and knowing how I can obtain it
will help.

Thanks.

Mapping

{
   "jr": {
    "_all": {
        "enabled": true,
        "index_analyzer": "index_analyzer",
        "search_analyzer": "search_analyzer"
    },
    "properties": {
        "pty_forename": {
            "type": "string",
            "index": "analyzed",
            "boost": 2,
            "index_analyzer": "index_analyzer",
            "search_analyzer": "search_analyzer",
            "store": "yes"
        },
        "pty_full_name": {
            "type": "string",
            "index": "analyzed",
            "boost": 4,
            "index_analyzer": "index_analyzer",
            "search_analyzer": "search_analyzer",
            "store": "yes"
        },
        "pty_surname": {
            "type": "string",
            "index": "analyzed",
            "boost": 4,
            "index_analyzer": "index_analyzer",
            "search_analyzer": "search_analyzer",
            "store": "yes"
        }
     }
   }
}

Index Settings

{
  "settings": {
    "number_of_shards": 2,
    "number_of_replicas": 0,
    "analysis": {
        "analyzer": {
            "index_analyzer": {
                "tokenizer": "standard",
                "filter": [
                    "standard",
                    "my_delimiter",
                    "lowercase",
                    "stop",
                    "asciifolding",
                    "porter_stem",
                    "my_metaphone"
                ]
            },
            "search_analyzer": {
                "tokenizer": "standard",
                "filter": [
                    "standard",
                    "my_metaphone",
                    "synonym",
                    "lowercase",
                    "stop",
                    "asciifolding",
                    "porter_stem"
                ]
            }
        },
        "filter": {
            "synonym": {
                "type": "synonym",
                "synonyms_path": "synonyms/synonyms.txt"
            },
            "my_delimiter": {
                "type": "word_delimiter",
                "generate_word_parts": true,
                "catenate_words": false,
                "catenate_numbers": false,
                "catenate_all": false,
                "split_on_case_change": false,
                "preserve_original": false,
                "split_on_numerics": false,
                "stem_english_possessive": false
            },
            "my_metaphone": {
                "type": "phonetic",
                "encoder": "metaphone",
                "replace": false
            }
        }
     }
   }
}

Fuzzy

{
"from":0, "size":100,
"query": {
    "bool": {
        "should": [
            {
                "fuzzy": {
                    "pty_surname": {
                        "min_similarity": 0.2,
                        "value": "Heaney",
                        "prefix_length": 0,
                        "boost": 5
                    }
                }
            },
            {
                "fuzzy": {
                    "pty_forename": {
                        "min_similarity": 1,
                        "value": "Michael",
                        "prefix_length": 0,
                        "boost": 1
                    }
                }
            }
        ]
     }
  }
}

--
You received this message because you are subscribed to the Google Groups "elasticsearch" group.
To unsubscribe from this group and stop receiving emails from it, send an email to elasticsearch+unsubscribe@googlegroups.com.
To view this discussion on the web visit https://groups.google.com/d/msgid/elasticsearch/4e2421cb-96b6-48b0-8143-9e3828641366%40googlegroups.com.
For more options, visit https://groups.google.com/groups/opt_out.


(system) #2