Sum aggregation over filtered data (per filter per sub level) - query help

maheshgawali · January 3, 2019, 12:42pm

I have a script (example: movie script) index with structure as given below

PUT scripts
{
  "settings": {
    "analysis": {
      "filter": {
        "email": {
          "type": "pattern_capture",
          "preserve_original": true,
          "patterns": [
            "([^@]+)",
            "(\\p{L}+)",
            "(\\d+)",
            "@(.+)",
            "([^-@]+)"
          ]
        },
        "eng_stemmer": {
          "type": "stemmer",
          "name": "english"
        }
      },
      "analyzer": {
        "email": {
          "tokenizer": "uax_url_email",
          "filter": [
            "email",
            "lowercase",
            "unique"
          ]
        },
        "eng_stemmed": {
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "eng_stemmer",
            "unique"
          ]
        }
      }
    }
  },
  "mappings": {
    "_doc": {
      "properties": {
        "timestamps": {
          "type": "float"
        },
        "script_uuid": {
          "type": "keyword"
        },
        "occurrences": {
          "properties": {
            "keywords": {
              "type": "text",
              "analyzer": "eng_stemmed",
              "fields": {
                "keyword": {
                  "type": "keyword"
                }
              }
            },
            "count": {
              "type": "integer"
            }
          }
        },
        "paragraph": {
          "type": "text",
          "analyzer": "eng_stemmed",
          "fields": {
            "keyword": {
              "type": "keyword"
            }
          }
        },
        "actor": {
          "properties": {
            "id": {
              "type": "keyword"
            },
            "email": {
              "type": "text",
              "analyzer": "email",
              "fields": {
                "keyword": {
                  "type": "keyword"
                }
              }
            }
          }
        },
        "last_modified": {
          "type": "date"
        }
      }
    }
  }
}

So a script is divided into paragraphs

Queries I am trying to do:

Given a script_uuid, find keyword occurences (sum) per actor for that script.
Given a keyword and a threshold of occurences find all paragraphs across scripts where that keyword appears and also give the total count
same as 2 but for a single script_uuid

among these i am able to get the scripts where keywords are used in paragraphs, but I am not able to count them from occurrences

sample data -

POST scripts/_doc/
{
  "script_uuid": "135a04e4-fe11-11e8-8eb2-f2801f1b9fd1",
  "timestamps": [1.1,1.2,1.3,1.4,1.5,1.6],
  "paragraph": "hi, where are the new apples",
  "occurrences": [
    {
      "keywords": "hi",
      "count": 1
    },
    {
      "keywords": "new apples",
      "count": 2
    }
  ],
  "actor": {
    "id": 0,
    "email": "robert@apac.com"
  },
  "last_modified": "2019-01-02T18:49:01-05:00"
}

POST scripts/_doc/
{
  "script_uuid":"135a04e4-fe11-11e8-8eb2-f2801f1b9fd1",
  "timestamps": [2.1, 2.2, 2.3,2.4,2.5,2.6,2.7],
  "paragraph": "hi, not sure about the new apples",
  "occurrences": [
    {
      "keywords": "hi",
      "count": 1
    },
    {
      "keywords": "new apples",
      "count": 1
    },
    {
      "keywords": "not",
      "count": 1
    }
  ],
  "actor": {
    "id": 1,
    "email": "luke@rtty.com"
  },
  "last_modified": "2019-01-02T18:49:02-05:00"
}

aggregation query:

GET scripts/_search
{
  "query": {
    "bool": {
      "must": {
        "term": {
          "script_uuid": "135a04e4-fe11-11e8-8eb2-f2801f1b9fd1"
        }
      }
    }
  },
  "aggs": {
    "by_keyword": {
      "filters": {
        "filters": {
          "test_word2": {
            "match": {
              "occurrences.keywords": "test_word2"
            }
          },
          "next apples": {
            "match": {
              "occurrences.keywords": "next apples"
            }
          },
          "hi": {
            "match": {
              "occurrences.keywords": "hi"
            }
          }
        }
      },
      "aggs": {
        "sum_occurences": {
          "sum": {
            "field": "occurrences.count"
          }
        },
        "by_user": {
          "terms": {
            "field": "actor.email.keyword"
          }
        }
      }
    }
  }
}

gives result as (sum for "hi" should be 2 and not 6)

"aggregations": {
    "by_keyword": {
      "buckets": {
        "hi": {
          "doc_count": 2,
          "sum_occurences": {
            "value": 6
          },
          "by_user": {
            "doc_count_error_upper_bound": 0,
            "sum_other_doc_count": 0,
            "buckets": [
              {
                "key": "luke@rtty.com",
                "doc_count": 1
              },
              {
                "key": "robert@apac.com",
                "doc_count": 1
              }
            ]
          }
        },
        "next apples": {
          "doc_count": 2,
          "sum_occurences": {
            "value": 6
          },
          "by_user": {
            "doc_count_error_upper_bound": 0,
            "sum_other_doc_count": 0,
            "buckets": [
              {
                "key": "luke@rtty.com",
                "doc_count": 1
              },
              {
                "key": "robert@apac.com",
                "doc_count": 1
              }
            ]
          }
        },
        "test_word2": {
          "doc_count": 0,
          "sum_occurences": {
            "value": 0
          },
          "by_user": {
            "doc_count_error_upper_bound": 0,
            "sum_other_doc_count": 0,
            "buckets": []
          }
        }
      }
    }
  }

I understand this is happening due to aggregating all counts from the filtered criteria, but is there a way to get sum per bucket ?

I want to show the stats for "keywords (and their count or sum) used by an actor in a given script"

Igor_Motov · January 3, 2019, 11:19pm

The field occurrences need to be nested for that and you need to use nested aggregation:

PUT scripts
{
  "settings": {
    "analysis": {
      "filter": {
        "email": {
          "type": "pattern_capture",
          "preserve_original": true,
          "patterns": [
            "([^@]+)",
            "(\\p{L}+)",
            "(\\d+)",
            "@(.+)",
            "([^-@]+)"
          ]
        },
        "eng_stemmer": {
          "type": "stemmer",
          "name": "english"
        }
      },
      "analyzer": {
        "email": {
          "tokenizer": "uax_url_email",
          "filter": [
            "email",
            "lowercase",
            "unique"
          ]
        },
        "eng_stemmed": {
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "eng_stemmer",
            "unique"
          ]
        }
      }
    }
  },
  "mappings": {
    "_doc": {
      "properties": {
        "timestamps": {
          "type": "float"
        },
        "script_uuid": {
          "type": "keyword"
        },
        "occurrences": {
          "type": "nested",
          "properties": {
            "keywords": {
              "type": "text",
              "analyzer": "eng_stemmed",
              "fields": {
                "keyword": {
                  "type": "keyword"
                }
              }
            },
            "count": {
              "type": "integer"
            }
          }
        },
        "paragraph": {
          "type": "text",
          "analyzer": "eng_stemmed",
          "fields": {
            "keyword": {
              "type": "keyword"
            }
          }
        },
        "actor": {
          "properties": {
            "id": {
              "type": "keyword"
            },
            "email": {
              "type": "text",
              "analyzer": "email",
              "fields": {
                "keyword": {
                  "type": "keyword"
                }
              }
            }
          }
        },
        "last_modified": {
          "type": "date"
        }
      }
    }
  }
}

POST scripts/_doc/
{
  "script_uuid": "135a04e4-fe11-11e8-8eb2-f2801f1b9fd1",
  "timestamps": [1.1,1.2,1.3,1.4,1.5,1.6],
  "paragraph": "hi, where are the new apples",
  "occurrences": [
    {
      "keywords": "hi",
      "count": 1
    },
    {
      "keywords": "new apples",
      "count": 2
    }
  ],
  "actor": {
    "id": 0,
    "email": "robert@apac.com"
  },
  "last_modified": "2019-01-02T18:49:01-05:00"
}

POST scripts/_doc/
{
  "script_uuid":"135a04e4-fe11-11e8-8eb2-f2801f1b9fd1",
  "timestamps": [2.1, 2.2, 2.3,2.4,2.5,2.6,2.7],
  "paragraph": "hi, not sure about the new apples",
  "occurrences": [
    {
      "keywords": "hi",
      "count": 1
    },
    {
      "keywords": "new apples",
      "count": 1
    },
    {
      "keywords": "not",
      "count": 1
    }
  ],
  "actor": {
    "id": 1,
    "email": "luke@rtty.com"
  },
  "last_modified": "2019-01-02T18:49:02-05:00"
}

GET scripts/_search
{
  "size": 0,
  "query": {
    "bool": {
      "must": {
        "term": {
          "script_uuid": "135a04e4-fe11-11e8-8eb2-f2801f1b9fd1"
        }
      }
    }
  },
  "aggs": {
    "occurrences": {
      "nested": {
        "path": "occurrences"
      },
      "aggs": {
        "by_keyword": {
          "filters": {
            "filters": {
              "test_word2": {
                "match": {
                  "occurrences.keywords": "test_word2"
                }
              },
              "next apples": {
                "match": {
                  "occurrences.keywords": "next apples"
                }
              },
              "hi": {
                "match": {
                  "occurrences.keywords": "hi"
                }
              }
            }
          },
          "aggs": {
            "sum_occurences": {
              "sum": {
                "field": "occurrences.count"
              }
            },
            "by_user": {
              "terms": {
                "field": "actor.email.keyword"
              }
            }
          }
        }
      }
    }
  }
}

maheshgawali · January 7, 2019, 11:37am

Thanks, works using nested

system · February 4, 2019, 11:37am

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.