Custom analyzer registered but not used


(phoenix) #1

Hi everyone,

I'm facing a curious problem.

I defined an analyzer in my settings, this way:

{
"index":{
"cluster.name":"test-cluster",
"client.transport.sniff":true,
"analysis":{
"filter":{
"french_elision":{
"type":"elision",
"articles":[
...skipped...
]
},
"french_stop":{
"type":"stop",
"stopwords":"french",
"ignore_case":true
},
"snowball":{
"type":"snowball",
"language":"french"
}
},
"analyzer":{
"my_french":{
"type":"custom",
"tokenizer":"standard",
"filter":[
"french_elision",
"lowercase",
"french_stop",
"snowball"
]
},
"lower_analyzer":{
"type":"custom",
"tokenizer":"keyword",
"filter":"lowercase"
},
"token_analyzer":{
"type":"custom",
"tokenizer":"whitespace"
}
}
}
}
}

And in my mapping, I have two text fields. On one of them I explicitly
set the analyzer to my_french, and on the other I let the global mapping
analyzer (also set to my_french) kick in automatically. Here is the
mapping.

{
"record":{
"_all":{
"enabled":false
},
"analyzer":"my_french",
"properties":{
"_uuid":{
"type":"string",
"store":"yes",
"index":"not_analyzed"
},
"a":{
"type":"multi_field",
"fields":{
"a":{
"type":"string",
"store":"yes",
"index":"analyzed",
"analyzer":"my_french"
},
"raw":{
"type":"string",
"store":"no",
"index":"not_analyzed"
},
"tokens":{
"type":"string",
"store":"no",
"index":"analyzed",
"analyzer":"token_analyzer"
},
"lower":{
"type":"string",
"store":"no",
"index":"analyzed",
"analyzer":"lower_analyzer"
}
}
},
"g_r":{
"type":"string",
"store":"yes",
"index":"analyzed"
}
}
}
}

When I try to analyze with a REST query using my analyzer, the result is
correct:

$ curl -XGET 'localhost:9200/test-index/_analyze?analyzer=my_french&pretty=true'
-d "j'aime les chevaux"
{
"tokens" : [ {
"token" : "aim",
"start_offset" : 0,
"end_offset" : 6,
"type" : "",
"position" : 1
}, {
"token" : "cheval",
"start_offset" : 11,
"end_offset" : 18,
"type" : "",
"position" : 3
} ]
}

But when I index my data and search on it, I don't get the expected result.
So, using a facet, I wanted to see what tokens had been stored in the
index, and here is the result:

$ curl -X POST "http://localhost:9200/test-index/_search?pretty=true" -d
'{"query": {"match": {"_id": "12"}},"facets": {"tokens": {"terms":
{"field": "a"}}}}'
{
"took" : 2,
"timed_out" : false,
"_shards" : {
"total" : 5,
"successful" : 5,
"failed" : 0
},
"hits" : {
"total" : 1,
"max_score" : 1.0,
"hits" : [ {
"_index" : "test-index",
"_type" : "record",
"_id" : "12",
"_score" : 1.0,
"_source":{"_uuid":"12","a_t":false,"a_n":false,"a":"J'aime les
chevaux","b_r":null,"b_t":false,"b_n":false,"b":1407664800000,"c_r":null,"c_t":false,"c_n":false,"c":2,"d_r":"m3","d_t":true,"d_n":false,"d":null,"e_r":null,"e_t":false,"e_n":true,"e":12,"f_r":null,"f_t":false,"f_n":false,"f":true,"g_r":"J'aime
les chevaux","g_t":false,"g_n":false,"g":12.0}
} ]
},
"facets" : {
"tokens" : {
"_type" : "terms",
"missing" : 0,
"total" : 2,
"other" : 0,
"terms" : [ {
"term" : "j'aim",
"count" : 1
}, {
"term" : "cheval",
"count" : 1
} ]
}
}
}

As you can see, the tokens are not correct. I expected 'aim' and 'cheval',
as produced by my analysis, but I got 'j'aim' and 'cheval'.

Could anyone tell me what I missed or what I'm doing wrong here?

Thanks !

--
You received this message because you are subscribed to the Google Groups "elasticsearch" group.
To unsubscribe from this group and stop receiving emails from it, send an email to elasticsearch+unsubscribe@googlegroups.com.
To view this discussion on the web visit https://groups.google.com/d/msgid/elasticsearch/28ab083e-8c87-462c-ae51-7e1d8df2ec51%40googlegroups.com.
For more options, visit https://groups.google.com/d/optout.


(system) #2