Inconsistent doc_freq in _mtermvectors

Hi there,

I have been using Elasticsearch to analyze text with a custom analyzer, and then using the _mtermvectors API to get the statistics for each keyword. While trying it out, I found an inconsistency in the _mtermvectors API.

My question is: how is it possible that, for a keyword like "day", the doc_freq is 15 in one document (id 1) and 16 in another document (id 3)? The example is below.

This is the index mapping and settings:

PUT /gdelt_pos
{ 
 "mappings": {
    "properties": {
      "source": {
        "type": "text"
       },
       "nouns": {
        "type": "text",
        "analyzer" : "keywords_analyzer"
      },
       "verbs": {
        "type": "text",
        "analyzer" : "keywords_analyzer"
      }
    }
  },
  "settings" : {
    "index" : {
      "number_of_shards" : 5,
      "number_of_replicas" : 1
    },
    "analysis": {
      "analyzer": {
        "keywords_analyzer": {
          "type": "custom",
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "stop_words",
            "asciifolding",
            "apostrophe",
            "exclude_nums",
            "stemmer"
          ]
        }
      }, 
      "filter":{
        "stop_words": {
          "type": "stop",
          "ignore_case": true
        },
        "exclude_nums": {
          "type": "keep_types",
          "types": [ "<NUM>" ],
          "mode": "exclude"
        },
        "stemmer" : {
          "type" : "stemmer",
          "name" : "porter2"
        }
      }
    }
  }
}

This is the _mtermvectors query:

POST /gdelt_pos/_mtermvectors
{
  "ids" : ["1", "3"],
  "parameters": {
  	"fields": [
       	"nouns"
    	],
      "offsets" : false,
      "payloads" : true,
      "positions" : false,
      "field_statistics" : true,
      "term_statistics" : true,
      "filter" : {
          "min_term_freq" : 2,
          "min_doc_freq" : 15
        }
    }
}

This is the response:

{
  "docs" : [
    {
      "_index" : "gdelt_pos",
      "_type" : "_doc",
      "_id" : "1",
      "_version" : 1,
      "found" : true,
      "took" : 1,
      "term_vectors" : {
        "nouns" : {
          "field_statistics" : {
            "sum_doc_freq" : 5954,
            "doc_count" : 60,
            "sum_ttf" : 10875
          },
          "terms" : {
            "day" : {
              "doc_freq" : 15,
              "ttf" : 33,
              "term_freq" : 2,
              "score" : 4.6765704
            },
            "peopl" : {
              "doc_freq" : 17,
              "ttf" : 30,
              "term_freq" : 3,
              "score" : 6.6615067
            },
            "work" : {
              "doc_freq" : 16,
              "ttf" : 26,
              "term_freq" : 6,
              "score" : 13.665964
            }
          }
        }
      }
    },
    {
      "_index" : "gdelt_pos2",
      "_type" : "_doc",
      "_id" : "3",
      "_version" : 1,
      "found" : true,
      "took" : 1,
      "term_vectors" : {
        "nouns" : {
          "field_statistics" : {
            "sum_doc_freq" : 6532,
            "doc_count" : 69,
            "sum_ttf" : 11388
          },
          "terms" : {
            "august" : {
              "doc_freq" : 16,
              "ttf" : 22,
              "term_freq" : 3,
              "score" : 7.245846
            },
            "day" : {
              "doc_freq" : 16,
              "ttf" : 25,
              "term_freq" : 2,
              "score" : 4.830564
            },
            "govern" : {
              "doc_freq" : 19,
              "ttf" : 42,
              "term_freq" : 2,
              "score" : 4.505526
            },
            "today" : {
              "doc_freq" : 17,
              "ttf" : 19,
              "term_freq" : 2,
              "score" : 4.716247
            }
          }
        }
      }
    }
  ]
}

Thank you in advance and hope someone can help me :slight_smile:

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.