Terms aggregation does not return an accurate document count


(熊运强) #1

My aggregation query looks like this:

   {
      "size": 0,
      "query": {
        "bool": {
          "must": [
            {
              "term": {
                "month_id": {
                  "value": 154,
                  "boost": 1
                }
              }
            }
          ]
        }
      },
      "aggregations": {
        "group_by_thread_id": {
          "terms": {
            "field": "thread_id",
            "size": 50,
            "shard_size": -1,
            "min_doc_count": 1,
            "shard_min_doc_count": 0,
            "show_term_doc_count_error": false,
            "order": [
              {
                "_count": "desc"
              },
              {
                "_term": "asc"
              }
            ]
          }
        }
      },
      "highlight": {},
      "ext": {}
    }

The response is as follows:

      {
    "took": 498,
    "timed_out": false,
    "_shards": {
    "total": 5,
    "successful": 5,
    "failed": 0
    },
    "hits": {
    "total": 7140222,
    "max_score": 0,
    "hits": [ ]
    },
    "aggregations": {
    "group_by_thread_id": {
    "doc_count_error_upper_bound": 10,
    "sum_other_doc_count": 7140093,
    "buckets": [
    {
    "key": 4028841712554024,
    "doc_count": 7
    }
    ,
    {
    "key": 4032481949343890,
    "doc_count": 6
    }
    ,
    {
    "key": 4027269313757139,
    "doc_count": 3
    }........

But when I run a term query for the key 4028841712554024, like this:

    {
    "size":0,
      "query": {
        "bool": {
          "must": [
            {
              "term": {
                "thread_id": {
                  "value": "4028841712554024",
                  "boost": 1
                }
              }
            }
          ]
        }
      },
      "highlight": {},
      "ext": {}
    }

The response looks like this:

{
"took": 2,
"timed_out": false,
"_shards": {
"total": 55,
"successful": 55,
"failed": 0
},
"hits": {
"total": 8,
"max_score": 0,
"hits": [ ]
}
}

My question is: how can I get an accurate document count for each key from the aggregation?

Thanks and regards


(Mark Harwood) #2

Is the month_id constraint from the original query missing?


(熊运强) #3

Hi @Mark_Harwood, good morning. I added the month_id filter like this:

{
  "size": 0,
  "query": {
    "bool": {
      "must": [
        {
          "term": {
            "month_id": {
              "value": 154,
              "boost": 1
            }
          }
        },
        {
          "bool": {
            "must": [
              {
                "term": {
                  "thread_id": {
                    "value": "4028841712554024",
                    "boost": 1
                  }
                }
              }
            ]
          }
        }
      ]
    }
  }
}

But the result is still the same:

      {
    "took": 2,
    "timed_out": false,
    "_shards": {
    "total": 5,
    "successful": 5,
    "failed": 0
    },
    "hits": {
    "total": 8,
    "max_score": 0,
    "hits": [ ]
    }
    }

(Russ Cam) #4

Please format (and indent) your code using the </> button at the top of the text editor; it makes reading it much easier, thanks!


(Kenny Wu) #5

One possible reason is that the thread_id field has doc values enabled but is not indexed. Can you provide the mapping for this index?


(熊运强) #6

Hi @kwu, my index metadata looks like this:

  {
    "state": "open",
    "settings": {
    "index": {
    "number_of_shards": "5",
    "provided_name": "exp_index_1506",
    "max_result_window": "10000000",
    "creation_date": "1483496343921",
    "number_of_replicas": "1",
    "uuid": "4Bhxz-NJQg6oBmIAJAJiSA",
    "version": {
    "created": "5000099"
    }
    }
    },
    "mappings": {
    "ds_item": {
    "properties": {
    "sentiment": {
    "type": "long"
    },
    "date_of_post": {
    "type": "date"
    },
    "item_type_id": {
    "type": "integer"
    },
    "year": {
    "type": "integer"
    },
    "item_type": {
    "type": "keyword"
    },
    "month_id": {
    "type": "integer"
    },
    "subject": {
    "analyzer": "smartcn",
    "type": "text",
    "index_options": "offsets"
    },
    "item_key": {
    "type": "keyword"
    },
    "is_topic_post": {
    "type": "boolean"
    },
    "cicforum_id": {
    "type": "long"
    },
    "sys_site_id": {
    "type": "integer"
    },
    "source": {
    "type": "keyword"
    },
    "sys_thread_id": {
    "type": "long"
    },
    "day_of_month": {
    "type": "integer"
    },
    "cicsite_id": {
    "type": "long"
    },
    "content": {
    "analyzer": "smartcn",
    "type": "text",
    "index_options": "offsets"
    },
    "forum_id": {
    "type": "keyword"
    },
    "domain_id": {
    "type": "long"
    },
    "forum_url": {
    "type": "keyword"
    },
    "thread_id": {
    "type": "long"
    },
    "project_id": {
    "type": "integer"
    },
    "keyword_group_id": {
    "type": "integer"
    },
    "pkey": {
    "type": "keyword"
    },
    "keyword": {
    "type": "keyword"
    },
    "keyword_group": {
    "type": "keyword"
    },
    "sys_poster_id": {
    "type": "long"
    },
    "thread_month_id": {
    "type": "long"
    },
    "time_id": {
    "type": "integer"
    },
    "item_id": {
    "type": "long"
    },
    "week_id": {
    "type": "integer"
    },
    "domain_1": {
    "type": "keyword"
    },
    "driver_categories": {
    "type": "text"
    },
    "poster_url": {
    "type": "keyword"
    },
    "tm_site_name": {
    "type": "keyword"
    },
    "site_name": {
    "type": "keyword"
    },
    "month": {
    "type": "integer"
    },
    "forum_name": {
    "type": "keyword"
    },
    "product_categories": {
    "type": "text"
    },
    "domain": {
    "type": "keyword"
    },
    "item_url": {
    "type": "keyword"
    },
    "poster_month_id": {
    "type": "long"
    },
    "site_id": {
    "type": "integer"
    },
    "poster_id": {
    "type": "keyword"
    },
    "source_id": {
    "type": "integer"
    },
    "poster": {
    "type": "keyword"
    }
    }
    }
    },
    "aliases": [ ],
    "primary_terms": {
    "0": 1,
    "1": 1,
    "2": 1,
    "3": 1,
    "4": 1
    },
    "in_sync_allocations": {
    "0": [
    "1RroiCrGSAmmskf4PagBtg"
    ,
    "9zERaadWR3ucZb2GCg8cGQ"
    ],
    "1": [
    "TZk52uZbTDuVT9HNl0eL_Q"
    ,
    "XT_UGhyEQoKmJo113Us80w"
    ],
    "2": [
    "tjrsRCdsQdW6oRiggEU3WA"
    ,
    "OLAZ0IgnSPa3ppmTdReWZQ"
    ],
    "3": [
    "yZd4PsgoQ1aKkzkryImBlg"
    ,
    "jvNZGRt_RjKjVftSOQ2ytQ"
    ],
    "4": [
    "W87Nmg3mSx6GA0Y9-VGwzQ"
    ,
    "bS7HdGIlQKSlkBifU9WbOw"
    ]
    }
    }

(Kenny Wu) #7

Are the two queries run against the same index? The first response shows 5 shards being hit, while the second shows 55 shards.


(熊运强) #8

The second query hit 55 shards because I forgot the month_id filter and the index name; maybe you can refer to the third one instead.


(Kenny Wu) #9

From the response you can see that hits.total is 8. The reason the hits array is empty is that you specified "size": 0 in the term query!


(system) #10

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.