Possible bug with 6.3 -- Analyzer not working correctly


(Jason Baumgartner) #1

I have the following mapping:

{
    "settings": {
        "index.codec": "best_compression",
        "index.shard.check_on_startup": "checksum",
        "index": {
            "number_of_shards": 10,
            "number_of_replicas": 0,
            "sort.field" : "created_at", 
            "sort.order" : "desc"
        },
        "analysis": {
            "analyzer": {
                "custom": {
                    "filter": ["lowercase"],
                    "tokenizer": "whitespace"
                }
            },
            "normalizer": {
                "my_normalizer": {
                    "type": "custom",
                    "char_filter": [],
                    "filter": ["lowercase", "asciifolding"]
                }
            }
        }
    },
    "mappings": {
        "user": {
            "_routing": {
                "required": false
            },
            "_all": {
                "enabled": false
            },
            "_source": {
                "enabled": true,
                "excludes": ["id"]
            },
            "dynamic": "false",
            "properties": {
                "description": {
                    "type": "text",
                    "analyzer": "custom"
                },
                "location": {
                    "type": "text",
                    "analyzer": "custom",
                    "fields": {
                        "keyword": {
                            "type": "keyword",
                            "ignore_above": 128
                        }
                    }
                },
                "name": {
                    "type": "text",
                    "analyzer": "custom",
                    "fields": {
                        "keyword": {
                            "type": "keyword",
                            "ignore_above": 128
                        }
                    }
                },
                "followers_count": {
                    "type": "integer"
                },
                "friends_count": {
                    "type": "integer"
                },
                "listed_count": {
                    "type": "integer"
                },
                "favourites_count": {
                    "type": "integer"
                },
                "statuses_count": {
                    "type": "integer"
                },
                "lang": {
                    "type": "keyword",
                    "normalizer": "my_normalizer"
                },
                "screen_name": {
                    "type": "keyword",
                    "normalizer": "my_normalizer"
                },
                "profile_image_url_https": {
                    "type": "keyword",
                    "normalizer": "my_normalizer"
                },
                "profile_background_image_url_https": {
                    "type": "keyword",
                    "normalizer": "my_normalizer"
                },
                "profile_use_background_image": {
                    "type": "boolean"
                },
                "verified": {
                    "type": "boolean"
                },
                "protected": {
                    "type": "boolean"
                },
                "default_profile": {
                    "type": "boolean"
                },
                "created_at": {
                    "type": "date",
                    "format": "epoch_second"
                },
                "retrieved_on": {
                    "type": "date",
                    "format": "epoch_second"
                },
                "last_updated": {
                    "type": "date",
                    "format": "epoch_second"
                },
                "id": {
                    "type": "long"
                }
            }
        }
    }
}
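
(For reference, the mapping above is applied at index-creation time with something like the following — assuming it is saved as mapping.json; the index name twitter matches the requests further down.)

curl -H "Content-Type: application/json" -XPUT localhost:9200/twitter -d @mapping.json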

I tested that the analyzer tokenizes hashtags correctly:

curl -H "Content-Type: application/json" -XPOST localhost:9200/twitter/_analyze -d '{"text":"This is a test to see how a #hashtag is tokenized","analyzer":"custom"}'

{
  "tokens" : [
    {
      "token" : "this",
      "start_offset" : 0,
      "end_offset" : 4,
      "type" : "word",
      "position" : 0
    },
    {
      "token" : "is",
      "start_offset" : 5,
      "end_offset" : 7,
      "type" : "word",
      "position" : 1
    },
    {
      "token" : "a",
      "start_offset" : 8,
      "end_offset" : 9,
      "type" : "word",
      "position" : 2
    },
    {
      "token" : "test",
      "start_offset" : 10,
      "end_offset" : 14,
      "type" : "word",
      "position" : 3
    },
    {
      "token" : "to",
      "start_offset" : 15,
      "end_offset" : 17,
      "type" : "word",
      "position" : 4
    },
    {
      "token" : "see",
      "start_offset" : 18,
      "end_offset" : 21,
      "type" : "word",
      "position" : 5
    },
    {
      "token" : "how",
      "start_offset" : 22,
      "end_offset" : 25,
      "type" : "word",
      "position" : 6
    },
    {
      "token" : "a",
      "start_offset" : 26,
      "end_offset" : 27,
      "type" : "word",
      "position" : 7
    },
    {
      "token" : "#hashtag",
      "start_offset" : 28,
      "end_offset" : 36,
      "type" : "word",
      "position" : 8
    },
    {
      "token" : "is",
      "start_offset" : 37,
      "end_offset" : 39,
      "type" : "word",
      "position" : 9
    },
    {
      "token" : "tokenized",
      "start_offset" : 40,
      "end_offset" : 49,
      "type" : "word",
      "position" : 10
    }
  ]
}
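
To rule out a mismatch between the named analyzer and the field mapping, the same _analyze request can also be pointed at the field itself, which checks the analyzer that the description field actually resolved to (a sketch of that request):

curl -H "Content-Type: application/json" -XPOST localhost:9200/twitter/_analyze -d '{"field":"description","text":"This is a test to see how a #hashtag is tokenized"}'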

However, when I index and try to search on hashtags, I get no results:

curl -H "Content-Type: application/json" -XGET localhost:9200/twitter/user/_search?pretty=true -d '{"query":{"match":{"description":{"query":"#privacy"}}}}'

{
  "took" : 1,
  "timed_out" : false,
  "_shards" : {
    "total" : 10,
    "successful" : 10,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : 0,
    "max_score" : null,
    "hits" : [ ]
  }
}

Is this something that broke recently?

Bug: Elasticsearch does not appear to be using the same analyzer at search time as the one defined for indexing.
Edit: Honestly, I don't know what's going on here. It's still not working as documented.
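
If the search analyzer really is the culprit, these two requests should show it: the first explains how the match query is analyzed and rewritten at search time, and the second dumps the terms that were actually indexed for one document (the document id 1 is just a placeholder):

curl -H "Content-Type: application/json" -XGET 'localhost:9200/twitter/_validate/query?explain=true' -d '{"query":{"match":{"description":"#privacy"}}}'

curl -XGET 'localhost:9200/twitter/user/1/_termvectors?fields=description&pretty=true'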


(Zachary Tong) #2

Can you show some of the documents that should be returned but aren't? I can try to reproduce it locally.

I haven't heard of anyone else encountering something like this yet, so it would be good to hunt down the problem.
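
If it helps, indexing a single document against that mapping and re-running the query should be enough for a minimal reproduction (the sample description text below is made up):

curl -H "Content-Type: application/json" -XPUT 'localhost:9200/twitter/user/1?refresh=true' -d '{"description":"testing a search for #privacy"}'

curl -H "Content-Type: application/json" -XGET 'localhost:9200/twitter/user/_search?pretty=true' -d '{"query":{"match":{"description":{"query":"#privacy"}}}}'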


(system) #3

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.