Aggregation still returning duplicates


(Andrew) #1

Hi guys,

I'm trying to get all unique values for a field that is not analyzed, but I am still getting duplicates:

Mapping:

 {
   "County":{
  "type":"string",
  "fielddata":true,
  "fields":{
     "raw":{
        "type":"string",
        "index":"not_analyzed"
     }
  }
   }
}

Query:

{
"size":999,
"_source":["County"],
"aggs": {
  "group_by_name": {
    "terms": { "field":"County.raw" },
    "aggs": {
      "remove_dups": {
        "top_hits": {
          "size": 1,
          "_source": false
        }
      }
    }
  }
}
}

Results:

{"_index":"properties","_type":"industrial","_id":"645","_score":1.0,"_source":{"County":"Oakland"}},{"_index":"properties","_type":"industrial","_id":"646","_score":1.0,"_source":{"County":"Oakland"}}

Any ideas as to why the aggregation is being ignored?


(David Pilato) #2

Is it with an old version of Elasticsearch?


(Andrew) #3

I'm currently running version 5.6.5


(David Pilato) #4

I tried this:

DELETE test
PUT test
{
  "mappings": {
    "doc": {
      "properties": {
        "County": {
          "type": "text",
          "fields": {
            "raw": {
              "type": "keyword"
            }
          }
        }
      }
    }
  }
}
PUT test/doc/1
{
  "County": "Oakland"
}
PUT test/doc/2
{
  "County": "Oakland"
}
GET test/_search
{
  "size": 999,
  "_source": [
    "County"
  ],
  "aggs": {
    "group_by_name": {
      "terms": {
        "field": "County.raw"
      },
      "aggs": {
        "remove_dups": {
          "top_hits": {
            "size": 1,
            "_source": false
          }
        }
      }
    }
  }
}

And I'm getting this:

{
  "took": 12,
  "timed_out": false,
  "_shards": {
    "total": 5,
    "successful": 5,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": 2,
    "max_score": 1,
    "hits": [
      {
        "_index": "test",
        "_type": "doc",
        "_id": "2",
        "_score": 1,
        "_source": {
          "County": "Oakland"
        }
      },
      {
        "_index": "test",
        "_type": "doc",
        "_id": "1",
        "_score": 1,
        "_source": {
          "County": "Oakland"
        }
      }
    ]
  },
  "aggregations": {
    "group_by_name": {
      "doc_count_error_upper_bound": 0,
      "sum_other_doc_count": 0,
      "buckets": [
        {
          "key": "Oakland",
          "doc_count": 2,
          "remove_dups": {
            "hits": {
              "total": 2,
              "max_score": 1,
              "hits": [
                {
                  "_index": "test",
                  "_type": "doc",
                  "_id": "2",
                  "_score": 1
                }
              ]
            }
          }
        }
      ]
    }
  }
}

(Andrew) #5

Should the query not be returning only a single "Oakland", or am I mistaken?


(Val Crettaz) #6

The hits section returns all matching documents, two in your case, which is correct.
The aggregations section returns the unique counties, one in your case, which is correct.


(system) #7

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.