Hi there,
I have been using Elasticsearch to analyze text with a custom analyzer, and then using the _mtermvectors API to get the statistics for each keyword. While trying it out, I found an inconsistency in the _mtermvectors API.
My question is: how is it possible that, for a keyword like "day", the doc_freq is 15 in one doc (id 1) and 16 in another doc (id 3)? The example is below:
This is the index mapping and settings:
PUT /gdelt_pos
{
"mappings": {
"properties": {
"source": {
"type": "text"
},
"nouns": {
"type": "text",
"analyzer" : "keywords_analyzer"
},
"verbs": {
"type": "text",
"analyzer" : "keywords_analyzer"
}
}
},
"settings" : {
"index" : {
"number_of_shards" : 5,
"number_of_replicas" : 1
},
"analysis": {
"analyzer": {
"keywords_analyzer": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"lowercase",
"stop_words",
"asciifolding",
"apostrophe",
"exclude_nums",
"stemmer"
]
}
},
"filter":{
"stop_words": {
"type": "stop",
"ignore_case": true
},
"exclude_nums": {
"type": "keep_types",
"types": [ "<NUM>" ],
"mode": "exclude"
},
"stemmer" : {
"type" : "stemmer",
"name" : "porter2"
}
}
}
}
}
This is the _mtermvectors query:
POST /gdelt_pos/_mtermvectors
{
"ids" : ["1", "3"],
"parameters": {
"fields": [
"nouns"
],
"offsets" : false,
"payloads" : true,
"positions" : false,
"field_statistics" : true,
"term_statistics" : true,
"filter" : {
"min_term_freq" : 2,
"min_doc_freq" : 15
}
}
}
This is the response:
{
"docs" : [
{
"_index" : "gdelt_pos",
"_type" : "_doc",
"_id" : "1",
"_version" : 1,
"found" : true,
"took" : 1,
"term_vectors" : {
"nouns" : {
"field_statistics" : {
"sum_doc_freq" : 5954,
"doc_count" : 60,
"sum_ttf" : 10875
},
"terms" : {
"day" : {
"doc_freq" : 15,
"ttf" : 33,
"term_freq" : 2,
"score" : 4.6765704
},
"peopl" : {
"doc_freq" : 17,
"ttf" : 30,
"term_freq" : 3,
"score" : 6.6615067
},
"work" : {
"doc_freq" : 16,
"ttf" : 26,
"term_freq" : 6,
"score" : 13.665964
}
}
}
}
},
{
"_index" : "gdelt_pos2",
"_type" : "_doc",
"_id" : "3",
"_version" : 1,
"found" : true,
"took" : 1,
"term_vectors" : {
"nouns" : {
"field_statistics" : {
"sum_doc_freq" : 6532,
"doc_count" : 69,
"sum_ttf" : 11388
},
"terms" : {
"august" : {
"doc_freq" : 16,
"ttf" : 22,
"term_freq" : 3,
"score" : 7.245846
},
"day" : {
"doc_freq" : 16,
"ttf" : 25,
"term_freq" : 2,
"score" : 4.830564
},
"govern" : {
"doc_freq" : 19,
"ttf" : 42,
"term_freq" : 2,
"score" : 4.505526
},
"today" : {
"doc_freq" : 17,
"ttf" : 19,
"term_freq" : 2,
"score" : 4.716247
}
}
}
}
}
]
}
Thank you in advance; I hope someone can help me.