Aggregations distinct doc_count

Hi,

I have an index of candidates, and each candidate has skills stored as nested objects. I am using nested aggregations to build facets with a count for each skill. While the "doc_count" on the aggregation works well, it does not filter out duplicate values within the same candidate. For example, if I have 2 candidates with the skill "HTML" and one of those candidates has the skill "HTML" listed twice, then the doc_count for "HTML" on the aggregation comes to a total of 3. I need the count to show 2 instead of 3 in a case like this.

This is my mapping:

{
  "mappings": {"_doc": {
	"properties": {
		"first_names": { "type": "text" },
		"last_name": { "type": "text" },
        "gender": { "type": "text" },
		"country": { "type": "text" },
		"province": { "type": "text" },
		"city": { "type": "text" },
        "skills": {          
           "type": "nested",
              "properties": {
                 "name": {
                    "type": "keyword"
                 }
              }
            }
		}
	}
  }
}

Sample data could look like this:

PUT localhost:9200/candidates

{
    "first_names":  "John",
    "last_name" :   "Smith",
    "gender" :      "Male",
    "country":      "South Africa",
    "province":     "Gauteng",
    "city":         "Johannesburg",
    "skills":       [
      {
        "name": "HTML"
      },
      {
        "name": "HTML"
      }
    ]
}

{
    "first_names":  "Jane",
    "last_name" :   "Smith",
    "gender" :      "Female",
    "country":      "South Africa",
    "province":     "Gauteng",
    "city":         "Pretoria",
    "skills":       [
      {
        "name": "HTML"
      },
      {
        "name": "CSS"
      }
    ]
}

My query looks like this:

{
	"query": {
		"bool": {
			"should": [
				{
					"multi_match" : {
						"query":      "",
						"type":       "best_fields",
						"fields":     [ "first_names", "last_name"]
					}
				}
			],
			"must": {
				"nested": {
					"path": "skill",
					"query": {
						"bool": {
							"must": {
								"match": {
									"skill.name": "HTML"
								}
							}
						}
					}
				}
			}
		}
	},
	"aggs" : {
		"skills": {
			"nested": {
				"path": "skill"
			},
			"aggs": {
				"skills": {
					"terms" : { "field" : "skill.name" } 
				}
			}
		}
	}
}

The result could look something like this:

{
  "took": 639,
  "timed_out": false,
  "_shards": {
    "total": 5,
    "successful": 5,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": 2,
    "max_score": 4.1932936,
    "hits": [
      {
        "_index": "candidates",
        "_type": "_doc",
        "_id": "1",
        "_score": 4.1932936,
        "_source": {
          "first_names":  "John",
          "last_name" :   "Smith",
          "gender" :      "Male",
          "country":      "South Africa",
          "province":     "Gauteng",
          "city":         "Johannesburg",
          "skill": [
            {
              "name": "HTML"
            },
            {
              "name": "HTML"
            }
          ]
        }
      },
      {
        "_index": "candidates",
        "_type": "_doc",
        "_id": "2",
        "_score": 4.1932936,
        "_source": {
          "first_names":  "Jane",
          "last_name" :   "Smith",
          "gender" :      "Female",
          "country":      "South Africa",
          "province":     "Gauteng",
          "city":         "Pretoria",
          "skill": [
            {
              "name": "HTML"
            },
            {
              "name": "CSS"
            }
          ]
        }
      }
    ]
  },
  "aggregations": {
    "skills": {
      "doc_count": 4,
      "skills": {
        "doc_count_error_upper_bound": 1,
        "sum_other_doc_count": 3,
        "buckets": [
          {
            "key": "HTML",
            "doc_count": 3
          },
          {
            "key": "CSS",
            "doc_count": 1
          }
        ]
      }
    }
  }
}

If there is any way to make the nested aggregation's doc_count count each candidate only once per skill, that would be great. Any assistance is much appreciated.

1 Like

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.