Fast way to find recent unique fields

(Ivan) #1

I'm using Jaeger for tracing with Elasticsearch 6.5.2 as the backend. Mapping looks like this:

mapping
{
  "jaeger-span-2019-04-16" : {
    "mappings" : {
      "span" : {
        "_all" : {
          "enabled" : false
        },
        "dynamic_templates" : [
          {
            "span_tags_map" : {
              "path_match" : "tag.*",
              "mapping" : {
                "ignore_above" : 256,
                "type" : "keyword"
              }
            }
          },
          {
            "process_tags_map" : {
              "path_match" : "process.tag.*",
              "mapping" : {
                "ignore_above" : 256,
                "type" : "keyword"
              }
            }
          }
        ],
        "properties" : {
          "duration" : {
            "type" : "long"
          },
          "flags" : {
            "type" : "integer"
          },
          "logs" : {
            "properties" : {
              "fields" : {
                "type" : "nested",
                "dynamic" : "false",
                "properties" : {
                  "key" : {
                    "type" : "keyword",
                    "ignore_above" : 256
                  },
                  "tagType" : {
                    "type" : "keyword",
                    "ignore_above" : 256
                  },
                  "value" : {
                    "type" : "keyword",
                    "ignore_above" : 256
                  }
                }
              },
              "timestamp" : {
                "type" : "long"
              }
            }
          },
          "operationName" : {
            "type" : "keyword",
            "ignore_above" : 256
          },
          "parentSpanID" : {
            "type" : "keyword",
            "ignore_above" : 256
          },
          "process" : {
            "properties" : {
              "serviceName" : {
                "type" : "keyword",
                "ignore_above" : 256
              },
              "tag" : {
                "type" : "object"
              },
              "tags" : {
                "type" : "nested",
                "dynamic" : "false",
                "properties" : {
                  "key" : {
                    "type" : "keyword",
                    "ignore_above" : 256
                  },
                  "tagType" : {
                    "type" : "keyword",
                    "ignore_above" : 256
                  },
                  "value" : {
                    "type" : "keyword",
                    "ignore_above" : 256
                  }
                }
              }
            }
          },
          "references" : {
            "type" : "nested",
            "dynamic" : "false",
            "properties" : {
              "refType" : {
                "type" : "keyword",
                "ignore_above" : 256
              },
              "spanID" : {
                "type" : "keyword",
                "ignore_above" : 256
              },
              "traceID" : {
                "type" : "keyword",
                "ignore_above" : 256
              }
            }
          },
          "spanID" : {
            "type" : "keyword",
            "ignore_above" : 256
          },
          "startTime" : {
            "type" : "long"
          },
          "startTimeMillis" : {
            "type" : "date",
            "format" : "epoch_millis"
          },
          "tag" : {
            "type" : "object"
          },
          "tags" : {
            "..." : "..."
          },
          "traceID" : {
            "type" : "keyword",
            "ignore_above" : 256
          }
        }
      },
      "_default_" : {
        "_all" : {
          "enabled" : false
        },
        "dynamic_templates" : [
          {
            "span_tags_map" : {
              "path_match" : "tag.*",
              "mapping" : {
                "ignore_above" : 256,
                "type" : "keyword"
              }
            }
          },
          {
            "process_tags_map" : {
              "path_match" : "process.tag.*",
              "mapping" : {
                "ignore_above" : 256,
                "type" : "keyword"
              }
            }
          }
        ]
      }
    }
  }
}

Individual documents are "spans" that have spanID and traceID. Multiple spans with the same traceID are considered a single "trace". UI has the need to find the latest X traces in some time span (let's say 1h) that match some criteria.

The current way of doing this is an aggregation like this:

{
  "aggregations": {
    "traceIDs": {
      "aggregations": {
        "startTime": {
          "max": {
            "field": "startTime"
          }
        }
      },
      "terms": {
        "field": "traceID",
        "order": [
          {
            "startTime": "desc"
          }
        ],
        "size": 20
      }
    }
  },
  "query": {
    "bool": {
      "must": [
        {
          "range": {
            "startTime": {
              "from": 1555437540000000,
              "include_lower": true,
              "include_upper": true,
              "to": 1555437600000000
            }
          }
        },
        {
          "match": {
            "process.serviceName": {
              "query": "nginx-ssl"
            }
          }
        }
      ]
    }
  },
  "size": 0
}

This seems reasonable, but it's unbearably slow: 20-40s to complete for 1h window. We have 4k spans/s for this service and each trace has two spans, amounting to ~330k hits (~165k buckets then?).

{
  "took": 38372,
  "timed_out": false,
  "_shards": {
    "total": 5,
    "successful": 5,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": 331761,
    "max_score": 0,
    "hits": []
  },
  "other": "stuff"
}

If instead I query for the last X documents and count unique traceID myself, the time drops down to a minuscule 26ms (instead of 38372ms):

{
  "took": 26,
  "timed_out": false,
  "_shards": {
    "total": 5,
    "successful": 5,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": 331761,
    "max_score": null,
    "hits": [
      "100 hits here"
    ]
  },
  "other": "stuff"
}

This is marvelous in terms of performance, but a bit lossy, since some large traces with many spans may dominate the search results (think 10k spans in one trace).

My question is how to do this sort of operation properly, so that it's both faster than the existing aggregation and more complete than the naive search with manual deduplication on the client.

Related PR for Jaeger: https://github.com/jaegertracing/jaeger/pull/1475

(system) closed #2

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.