I have an index with the following settings and mappings:
{
  "settings": {
    "index.codec": "best_compression",
    "index.shard.check_on_startup": "checksum",
    "index": {
      "number_of_shards": 10,
      "number_of_replicas": 0,
      "sort.field": "created_at",
      "sort.order": "desc"
    },
    "analysis": {
      "analyzer": {
        "custom": {
          "filter": ["lowercase"],
          "tokenizer": "whitespace"
        }
      },
      "normalizer": {
        "my_normalizer": {
          "type": "custom",
          "char_filter": [],
          "filter": ["lowercase", "asciifolding"]
        }
      }
    }
  },
  "mappings": {
    "user": {
      "_routing": {
        "required": false
      },
      "_all": {
        "enabled": false
      },
      "_source": {
        "enabled": true,
        "excludes": ["id"]
      },
      "dynamic": "false",
      "properties": {
        "description": {
          "type": "text",
          "analyzer": "custom"
        },
        "location": {
          "type": "text",
          "analyzer": "custom",
          "fields": {
            "keyword": {
              "type": "keyword",
              "ignore_above": 128
            }
          }
        },
        "name": {
          "type": "text",
          "analyzer": "custom",
          "fields": {
            "keyword": {
              "type": "keyword",
              "ignore_above": 128
            }
          }
        },
        "followers_count": {
          "type": "integer"
        },
        "friends_count": {
          "type": "integer"
        },
        "listed_count": {
          "type": "integer"
        },
        "favourites_count": {
          "type": "integer"
        },
        "statuses_count": {
          "type": "integer"
        },
        "lang": {
          "type": "keyword",
          "normalizer": "my_normalizer"
        },
        "screen_name": {
          "type": "keyword",
          "normalizer": "my_normalizer"
        },
        "profile_image_url_https": {
          "type": "keyword",
          "normalizer": "my_normalizer"
        },
        "profile_background_image_url_https": {
          "type": "keyword",
          "normalizer": "my_normalizer"
        },
        "profile_use_background_image": {
          "type": "boolean"
        },
        "verified": {
          "type": "boolean"
        },
        "protected": {
          "type": "boolean"
        },
        "default_profile": {
          "type": "boolean"
        },
        "created_at": {
          "type": "date",
          "format": "epoch_second"
        },
        "retrieved_on": {
          "type": "date",
          "format": "epoch_second"
        },
        "last_updated": {
          "type": "date",
          "format": "epoch_second"
        },
        "id": {
          "type": "long"
        }
      }
    }
  }
}
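For reference, I create the index with the settings and mappings above in a single request along these lines (user_mapping.json is just a placeholder name for the JSON shown above):
curl -H "Content-Type: application/json" -XPUT localhost:9200/twitter -d @user_mapping.json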
I verified that the custom analyzer tokenizes hashtags as single tokens (the whitespace tokenizer keeps the leading #):
curl -H "Content-Type: application/json" -XPOST localhost:9200/twitter/_analyze?pretty -d '{"text":"This is a test to see how a #hashtag is tokenized","analyzer":"custom"}'
{
  "tokens" : [
    {
      "token" : "this",
      "start_offset" : 0,
      "end_offset" : 4,
      "type" : "word",
      "position" : 0
    },
    {
      "token" : "is",
      "start_offset" : 5,
      "end_offset" : 7,
      "type" : "word",
      "position" : 1
    },
    {
      "token" : "a",
      "start_offset" : 8,
      "end_offset" : 9,
      "type" : "word",
      "position" : 2
    },
    {
      "token" : "test",
      "start_offset" : 10,
      "end_offset" : 14,
      "type" : "word",
      "position" : 3
    },
    {
      "token" : "to",
      "start_offset" : 15,
      "end_offset" : 17,
      "type" : "word",
      "position" : 4
    },
    {
      "token" : "see",
      "start_offset" : 18,
      "end_offset" : 21,
      "type" : "word",
      "position" : 5
    },
    {
      "token" : "how",
      "start_offset" : 22,
      "end_offset" : 25,
      "type" : "word",
      "position" : 6
    },
    {
      "token" : "a",
      "start_offset" : 26,
      "end_offset" : 27,
      "type" : "word",
      "position" : 7
    },
    {
      "token" : "#hashtag",
      "start_offset" : 28,
      "end_offset" : 36,
      "type" : "word",
      "position" : 8
    },
    {
      "token" : "is",
      "start_offset" : 37,
      "end_offset" : 39,
      "type" : "word",
      "position" : 9
    },
    {
      "token" : "tokenized",
      "start_offset" : 40,
      "end_offset" : 49,
      "type" : "word",
      "position" : 10
    }
  ]
}
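Documents are then indexed with requests along these lines (this is a made-up sample document for illustration, not my real data):
curl -H "Content-Type: application/json" -XPUT localhost:9200/twitter/user/123456 -d '{"id":123456,"screen_name":"SomeUser","description":"Tweets about #privacy and security","created_at":1514764800}'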
However, after indexing user documents, searching for a hashtag in the description field returns no hits:
curl -H "Content-Type: application/json" -XGET localhost:9200/twitter/user/_search?pretty=true -d '{"query":{"match":{"description":{"query":"#privacy"}}}}'
{
  "took" : 1,
  "timed_out" : false,
  "_shards" : {
    "total" : 10,
    "successful" : 10,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : 0,
    "max_score" : null,
    "hits" : [ ]
  }
}
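In case it helps narrow this down, the mapping that the index actually reports can be inspected with a request like this (I have not included the output here):
curl -XGET localhost:9200/twitter/_mapping/user?pretty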
Is this a regression in a recent Elasticsearch release, or am I missing something?
Bug (my suspicion): Elasticsearch is not using the same analyzer at search time as the one defined for indexing, even though a match query should default to the analyzer of the field it targets (there is no separate search_analyzer in this mapping).
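One way to check which analyzer the mapping actually attaches to the description field is to run _analyze against the field instead of naming the analyzer explicitly (a sketch of that check):
curl -H "Content-Type: application/json" -XPOST localhost:9200/twitter/_analyze?pretty -d '{"field":"description","text":"#privacy"}'
If this returns the single token #privacy, the description field is picking up the custom analyzer; if it only returns privacy, the field has fallen back to the standard analyzer and the documents were not indexed with the whitespace/lowercase chain shown above.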
Edit: Honestly, I still don't know what is going on here; searching still does not behave the way the documentation describes.