Trying to create a transform job that does not go through all documents

Hello,

I have started using transforms, but I am struggling to figure out how to process only recent documents rather than everything from the beginning (I have a year of historical data).

The JSON of the job is:

{
  "id": "ops_authrate_1m",
  "authorization": {
    "roles": [
      "machine_learning_admin",
      "kibana_admin",
      "superuser",
      "rollup_admin",
      "Admin",
      "monitoring_user",
      "kibana_system",
      "metricbeat_internal"
    ]
  },
  "version": "8.4.1",
  "create_time": 1677688246676,
  "source": {
    "index": [
      "opsrptlog-prod-*"
    ],
    "query": {
      "bool": {
        "should": [
          {
            "match_phrase": {
              "ops_event.transaction_type.keyword": "Card_Authorize"
            }
          }
        ],
        "minimum_should_match": 1
      }
    }
  },
  "dest": {
    "index": "prod-authrate-opsrpt"
  },
  "sync": {
    "time": {
      "field": "@timestamp",
      "delay": "60s"
    }
  },
  "pivot": {
    "group_by": {
      "ops_event.company_name": {
        "terms": {
          "field": "ops_event.company_name.keyword"
        }
      },
      "@timestamp": {
        "date_histogram": {
          "field": "@timestamp",
          "calendar_interval": "1m"
        }
      }
    },
    "aggregations": {
      "total_events": {
        "value_count": {
          "field": "ops_event.status.keyword"
        }
      },
      "processed": {
        "filter": {
          "term": {
            "ops_event.status.keyword": "Processed"
          }
        },
        "aggs": {
          "all_processed": {
            "value_count": {
              "field": "ops_event.status.keyword"
            }
          }
        }
      },
      "percentage": {
        "bucket_script": {
          "buckets_path": {
            "success": "processed>all_processed",
            "total": "total_events"
          },
          "script": "params.success / params.total * 100"
        }
      }
    }
  },
  "settings": {
    "docs_per_second": 20000
  },
  "retention_policy": {
    "time": {
      "field": "@timestamp",
      "max_age": "32d"
    }
  }
}

I guess I am looking for a way to specify a range, for example only the last day.

Thank you

Best practice is to add a range query with an absolute start date, e.g.

"query" : {
          "range": {
            "@timestamp": {
              "gte": "2023-01-01T00:00:00"
            }
          }
        }

Using date math (e.g. now-1d) can cause performance issues, because date math leads to cache misses; use an absolute start date instead. The transform takes care of the rest: checkpointing optimizes the queries for you.
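If you want to sanity-check the query before recreating the job, you can run it through the preview API first. A minimal sketch using the index, group_by, and one aggregation from your config (trimmed for brevity; Dev Tools allows // comments):

POST _transform/_preview
{
  // same index as your job, with an absolute start date as suggested above
  "source": {
    "index": "opsrptlog-prod-*",
    "query": {
      "range": {
        "@timestamp": {
          "gte": "2023-01-01T00:00:00"
        }
      }
    }
  },
  "pivot": {
    "group_by": {
      "ops_event.company_name": {
        "terms": {
          "field": "ops_event.company_name.keyword"
        }
      },
      "@timestamp": {
        "date_histogram": {
          "field": "@timestamp",
          "calendar_interval": "1m"
        }
      }
    },
    "aggregations": {
      "total_events": {
        "value_count": {
          "field": "ops_event.status.keyword"
        }
      }
    }
  }
}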

Thanks for the response.

I have created this to run in Dev Tools:

PUT _transform/test1
{
  "source": {
    "index": "opsrptlog-prod-*",
    "query": {
      "range": {
        "@timestamp": {
          "gte": "2023-03-01T00:00:00"
        }
      },
      "bool": {
        "should": [
          {
            "match_phrase": {
              "ops_event.transaction_type.keyword": "Card_Authorize"
            }
          }
        ],
        "minimum_should_match": 1
      }
    }
  },
  "pivot": {
    "group_by": {
      "ops_event.company_name": {
        "terms": {
          "field": "ops_event.company_name.keyword"
        }
      },
      "@timestamp": {
        "date_histogram": {
          "field": "@timestamp",
          "calendar_interval": "1m"
        }
      }
    },
    "aggregations": {
      "total_events": {
        "value_count": {
          "field": "ops_event.status.keyword"
        }
      },
      "processed": {
        "filter": {
          "term": {
            "ops_event.status.keyword": "Processed"
          }
        },
        "aggs": {
          "all_processed": {
            "value_count": {
              "field": "ops_event.status.keyword"
            }
          }
        }
      },
      "percentage": {
        "bucket_script": {
          "buckets_path": {
            "success": "processed>all_processed",
            "total": "total_events"
          },
          "script": "params.success / params.total * 100"
        }
      }
    }
  },
  "description": "Test description",
  "dest": {
    "index": "test1"
  },
  "frequency": "5m",
  "retention_policy": {
    "time": {
      "field": "@timestamp",
      "max_age": "32d"
    }
  }
}

And when I try to push it I get:

{
  "error": {
    "root_cause": [
      {
        "type": "parsing_exception",
        "reason": "[range] malformed query, expected [END_OBJECT] but found [FIELD_NAME]",
        "line": 1,
        "col": 55
      }
    ],
    "type": "x_content_parse_exception",
    "reason": "[20:5] [data_frame_transform_config] failed to parse field [source]",
    "caused_by": {
      "type": "x_content_parse_exception",
      "reason": "[20:5] [data_frame_config_source] failed to parse field [query]",
      "caused_by": {
        "type": "parsing_exception",
        "reason": "[range] malformed query, expected [END_OBJECT] but found [FIELD_NAME]",
        "line": 1,
        "col": 55
      }
    }
  },
  "status": 400
}

Thank you for looking into this.

As you already have another query clause, you have to combine the two. Both can go into filter, because your should is effectively a must, and filter is the better must when you don't need scoring:

      "bool": {
        "filter": [
          {
            "match_phrase": {
              "ops_event.transaction_type.keyword": "Card_Authorize"
            }
          },
          {
            "range": {
              "@timestamp": {
                "gte": "2023-03-01T00:00:00"
              }
           }
         }
        ]
      }
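For context, here is where that sits in the source section of your PUT _transform/test1 request: both clauses live inside a single bool directly under query, rather than as siblings of it, which is what triggered the parsing_exception. The rest of the body stays as you had it:

"source": {
  "index": "opsrptlog-prod-*",
  "query": {
    "bool": {
      "filter": [
        {
          "match_phrase": {
            "ops_event.transaction_type.keyword": "Card_Authorize"
          }
        },
        {
          "range": {
            "@timestamp": {
              "gte": "2023-03-01T00:00:00"
            }
          }
        }
      ]
    }
  }
}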
