ElasticSearch: Order top-level aggregation buckets based on reverse_nested doc_count

afasias · July 9, 2021, 7:58am

Hi.
My Elasticsearch info:

{
    "version": {
        "number": "6.3.1",
        "build_flavor": "oss",
        "build_type": "zip",
        "build_hash": "9d3e794",
        "build_date": "2020-06-09T23:40:32.185794Z",
        "build_snapshot": false,
        "lucene_version": "7.3.1",
        "minimum_wire_compatibility_version": "5.6.0",
        "minimum_index_compatibility_version": "5.0.0"
    }
}

I'm working on an aggregation with multiple sub-aggregations in which I need to order the top-level aggregation buckets based on the doc_count of a lower-level reverse_nested aggregation.

This is how my index is created:

PUT /myindex
{
  "mappings": {
    "default": {
      "properties": {
        "items": {
          "type": "nested",
          "properties": {
            "subitems": {
              "type": "nested",
              "properties": {
                "id": {
                  "type": "long"
                },
                "name": {
                  "type": "keyword"
                }
              }
            }
          }
        },
        "name": {
          "type": "keyword"
        }
      }
    }
  }
}

And these are the sample documents that I have indexed:

{
  "name": "Document #1",
  "items": [
    {
      "subitems": [
        {
          "id": 1,
          "name": "Subitem #1"
        },
        {
          "id": 2,
          "name": "Subitem #2"
        }
      ]
    },
    {
      "subitems": [
        {
          "id": 2,
          "name": "Subitem #2"
        },
        {
          "id": 3,
          "name": "Subitem #3"
        }
      ]
    }
  ]
}

{
  "name": "Document #2",
  "items": [
    {
      "subitems": [
        {
          "id": 2,
          "name": "Subitem #2"
        }
      ]
    }
  ]
}

{
  "name": "Document #3",
  "items": [
    {
      "subitems": [
        {
          "id": 3,
          "name": "Subitem #3"
        }
      ]
    },
    {
      "subitems": [
        {
          "id": 2,
          "name": "Subitem #2"
        }
      ]
    }
  ]
}

{
  "name": "Document #4",
  "items": [
    {
      "subitems": [
        {
          "id": 2,
          "name": "Subitem #2"
        },
        {
          "id": 5,
          "name": "Subitem #5"
        }
      ]
    }
  ]
}

{
  "name": "Document #5",
  "items": [
    {
      "subitems": [
        {
          "id": 2,
          "name": "Subitem #2"
        }
      ]
    },
    {
      "subitems": [
        {
          "id": 2,
          "name": "Subitem #2"
        }
      ]
    },
    {
      "subitems": [
        {
          "id": 2,
          "name": "Subitem #2"
        }
      ]
    },
    {
      "subitems": [
        {
          "id": 2,
          "name": "Subitem #2"
        }
      ]
    },
    {
      "subitems": [
        {
          "id": 2,
          "name": "Subitem #2"
        }
      ]
    },
    {
      "subitems": [
        {
          "id": 2,
          "name": "Subitem #2"
        }
      ]
    }
  ]
}

{
  "name": "Document #6",
  "items": [
    {
      "subitems": [
        {
          "id": 3,
          "name": "Subitem #3"
        }
      ]
    }
  ]
}

{
  "name": "Document #7",
  "items": [
    {
      "subitems": [
        {
          "id": 3,
          "name": "Subitem #3"
        }
      ]
    }
  ]
}

{
  "name": "Document #8",
  "items": [
    {
      "subitems": [
        {
          "id": 3,
          "name": "Subitem #3"
        }
      ]
    }
  ]
}

{
  "name": "Document #9",
  "items": [
    {
      "subitems": [
        {
          "id": 3,
          "name": "Subitem #3"
        }
      ]
    }
  ]
}

I need my aggregation to be able to extract the number of Documents that contain each sub-item id/name pair. (consider subitem ids always correspond to the same subitem name).
That is:

id | name       | count
---+------------+------
2  | Subitem #2 | 5
3  | Subitem #3 | 6
1  | Subitem #1 | 1
5  | Subitem #5 | 1

This is the original aggregation query:

GET /myindex/default/_search
{
  "size": 0,
  "aggregations": {
    "my_nested_agg": {
      "nested": {
        "path": "items.subitems"
      },
      "aggregations": {
        "subitem_id": {
          "terms": {
            "field": "items.subitems.id"
          },
          "aggregations": {
            "subitem_name": {
              "terms": {
                "field": "items.subitems.name"
              },
              "aggregations": {
                "my_rev_agg": {
                  "reverse_nested": {}
                }
              }
            }
          }
        }
      }
    }
  }
}

The aggregation seems to return all the data I need:

{
  "took": 0,
  "timed_out": false,
  "_shards": {
    "total": 5,
    "successful": 5,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": 9,
    "max_score": 0.0,
    "hits": []
  },
  "aggregations": {
    "my_nested_agg": {
      "doc_count": 19,
      "subitem_id": {
        "doc_count_error_upper_bound": 0,
        "sum_other_doc_count": 0,
        "buckets": [
          {
            "key": 2,
            "doc_count": 11,
            "subitem_name": {
              "doc_count_error_upper_bound": 0,
              "sum_other_doc_count": 0,
              "buckets": [
                {
                  "key": "Subitem #2",
                  "doc_count": 11,
                  "my_rev_agg": {
                    "doc_count": 5
                  }
                }
              ]
            }
          },
          {
            "key": 3,
            "doc_count": 6,
            "subitem_name": {
              "doc_count_error_upper_bound": 0,
              "sum_other_doc_count": 0,
              "buckets": [
                {
                  "key": "Subitem #3",
                  "doc_count": 6,
                  "my_rev_agg": {
                    "doc_count": 6
                  }
                }
              ]
            }
          },
          {
            "key": 1,
            "doc_count": 1,
            "subitem_name": {
              "doc_count_error_upper_bound": 0,
              "sum_other_doc_count": 0,
              "buckets": [
                {
                  "key": "Subitem #1",
                  "doc_count": 1,
                  "my_rev_agg": {
                    "doc_count": 1
                  }
                }
              ]
            }
          },
          {
            "key": 5,
            "doc_count": 1,
            "subitem_name": {
              "doc_count_error_upper_bound": 0,
              "sum_other_doc_count": 0,
              "buckets": [
                {
                  "key": "Subitem #5",
                  "doc_count": 1,
                  "my_rev_agg": {
                    "doc_count": 1
                  }
                }
              ]
            }
          }
        ]
      }
    }
  }
}

However, the buckets are ordered in descending order based on the doc_count of the "subitem_id" sub-aggregation.

Instead, I need the buckets to be ordered in descending order based the doc_count of the reverse_nested sub-aggregation. Like this:

id | name       | count
---+------------+------
3  | Subitem #3 | 6
2  | Subitem #2 | 5
1  | Subitem #1 | 1
5  | Subitem #5 | 1

I tried to achieve this with the following query:

GET /myindex/default/_search
{
  "size": 0,
  "aggregations": {
    "my_nested_agg": {
      "nested": {
        "path": "items.subitems"
      },
      "aggregations": {
        "subitem_id": {
          "terms": {
            "field": "items.subitems.id",
            "order": [
              {
                "subitem_name>my_rev_agg._count": "desc"
              }
            ]
          },
          "aggregations": {
            "subitem_name": {
              "terms": {
                "field": "items.subitems.name"
              },
              "aggregations": {
                "my_rev_agg": {
                  "reverse_nested": {}
                }
              }
            }
          }
        }
      }
    }
  }
}

But then I get the error:

Invalid aggregation order path [subitem_name>my_rev_agg._count]. Buckets can only be sorted on a sub-aggregator path that is built out of zero or more single-bucket aggregations within the path and a final single-bucket or a metrics aggregation at the path end. Sub-path [subitem_name] points to non single-bucket aggregation

Could you please advise.
Thank you very much in advance.

afasias · July 9, 2021, 10:23am

I found a solution that does what I wanted. The key point is to move the reverse_nested aggregation outside the terms sub-aggregation used to retrieve the name:

GET /myindex/default/_search
{
  "size": 0,
  "aggregations": {
    "my_nested_agg": {
      "nested": {
        "path": "items.subitems"
      },
      "aggregations": {
        "subitem_id": {
          "terms": {
            "field": "items.subitems.id",
            "order": [
              {
                "my_rev_agg": "desc"
              }
            ]
          },
          "aggregations": {
            "subitem_name": {
              "terms": {
                "field": "items.subitems.name"
              }
            },
            "my_rev_agg": {
              "reverse_nested": {}
            }
          }
        }
      }
    }
  }
}

This return the subitem buckets ordered correctly:

{
  "took": 0,
  "timed_out": false,
  "_shards": {
    "total": 5,
    "successful": 5,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": 9,
    "max_score": 0.0,
    "hits": []
  },
  "aggregations": {
    "my_nested_agg": {
      "doc_count": 19,
      "subitem_id": {
        "doc_count_error_upper_bound": 0,
        "sum_other_doc_count": 0,
        "buckets": [
          {
            "key": 3,
            "doc_count": 6,
            "my_rev_agg": {
              "doc_count": 6
            },
            "subitem_name": {
              "doc_count_error_upper_bound": 0,
              "sum_other_doc_count": 0,
              "buckets": [
                {
                  "key": "Subitem #3",
                  "doc_count": 6
                }
              ]
            }
          },
          {
            "key": 2,
            "doc_count": 11,
            "my_rev_agg": {
              "doc_count": 5
            },
            "subitem_name": {
              "doc_count_error_upper_bound": 0,
              "sum_other_doc_count": 0,
              "buckets": [
                {
                  "key": "Subitem #2",
                  "doc_count": 11
                }
              ]
            }
          },
          {
            "key": 1,
            "doc_count": 1,
            "my_rev_agg": {
              "doc_count": 1
            },
            "subitem_name": {
              "doc_count_error_upper_bound": 0,
              "sum_other_doc_count": 0,
              "buckets": [
                {
                  "key": "Subitem #1",
                  "doc_count": 1
                }
              ]
            }
          },
          {
            "key": 5,
            "doc_count": 1,
            "my_rev_agg": {
              "doc_count": 1
            },
            "subitem_name": {
              "doc_count_error_upper_bound": 0,
              "sum_other_doc_count": 0,
              "buckets": [
                {
                  "key": "Subitem #5",
                  "doc_count": 1
                }
              ]
            }
          }
        ]
      }
    }
  }
}

system · August 6, 2021, 10:24am

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.

Topic		Replies	Views
Sorting reverse nested aggregations Elasticsearch	4	1347	July 5, 2017
Reverse Nested Aggregation Sort Elasticsearch	1	657	July 6, 2017
How do I sort buckets by Term Aggregation's doc_count? Elasticsearch	2	13986	September 18, 2018
Sorting "children" aggregations Elasticsearch	2	1046	July 17, 2017
Getting nested document in sorted order Elasticsearch	2	675	July 5, 2017

ElasticSearch: Order top-level aggregation buckets based on reverse_nested doc_count

Related topics