Hi @przemekwitek
- How do you achieve concatenation? Do you use a script or a runtime field? -> The concatenation is done with a pipeline processor.
- Do you have two separate group-by clauses for first and last names, or just one for fullName? -> Separate firstName/lastName fields
- Could you provide the full transform config and a few rows from the source index?
source-index:
"hits": [
{
"_index": "person-data-index",
"_id": "x-V2HI0BU2N54XdmKHXE",
"_score": 1,
"_source": {
"firstName": "FirstName1",
"lastName": "LastName1",
"zipCode": null,
"personId": "personId",
"ingest_timestamp": "2024-01-22T12:29:45.480638977Z",
"source": "source-1"
}
},
{
"_index": "person-data-index",
"_id": "jfl2HI0BWquxb6JedpIc",
"_score": 1,
"_source": {
"firstName": "FirstName2",
"lastName": "LastName2",
"zipCode": "12345",
"personId": "personId-1",
"ingest_timestamp": "2024-01-18T12:05:42.043752715Z",
"source": "source-1"
}
},
{
"_index": "person-data-index",
"_id": "yOV2HI0BU2N54Xdmj3UR",
"_score": 1,
"_source": {
"firstName": "FirstName2",
"lastName": "LastName2",
"zipCode": "12345",
"personId": "personId-1",
"ingest_timestamp": "2024-01-18T12:05:48.433166779Z",
"source": "source-2"
}
},
{
"_index": "person-data-index",
"_id": "jPl2HI0BWquxb6JeUJKg",
"_score": 1,
"_source": {
"firstName": "FirstName1",
"lastName": "LastName1",
"zipCode": null,
"personId": "personId",
"ingest_timestamp": "2024-01-22T12:30:45.237586836Z",
"source": "source-2"
}
}
]
transform:
{
"id": "dest-transformation-personal-data-001",
"authorization": {
"roles": [
"superuser"
]
},
"version": "8.7.0",
"create_time": 1705925962383,
"source": {
"index": [
"transformed-personal-data-index"
],
"query": {
"bool": {
"must": [
{
"exists": {
"field": "combined.firstName"
}
},
{
"exists": {
"field": "combined.lastName"
}
},
{
"exists": {
"field": "combined.zipCode"
}
}
]
}
}
},
"dest": {
"index": "dest-personal-data-index",
"pipeline": "ingest_timestamp_pipeline"
},
"frequency": "1m",
"sync": {
"time": {
"field": "ingest_timestamp",
"delay": "60s"
}
},
"pivot": {
"group_by": {
"combined.concatenatedPersonalData": {
"terms": {
"field": "combined.concatenatedPersonalData"
}
}
},
"aggregations": {
"values": {
"scripted_metric": {
"init_script": "\n state.doc = [];\n ",
"map_script": "\n state.doc.add(params['_source']['combined']);\n ",
"combine_script": "return state",
"reduce_script": "\n def result = [];\n for (state in states) {\n for (s in state.doc) {\n result.add(s);\n }\n }\n return result;\n "
}
}
}
},
"settings": {
"max_page_search_size": 500
}
}
result index:
"hits": [
{
"_index": "dest-personal-data-index",
"_id": "RoIoqTPJe2AGm__QmnP8A2UAAAAAAAAA",
"_score": 1,
"_source": {
"ingest_timestamp": "2024-01-22T12:19:23.322437191Z",
"values": [
{
"firstName": "FirstName1",
"lastName": "LastName1",
"zipCode": "123456",
"personId": "personId",
"ingest_timestamp": "2024-01-18T12:05:32.448356717Z",
"concatenatedPersonalData": "FirstName1|LastName1|123456",
"source": "source-2"
}
],
"combined": {
"concatenatedPersonalData": "FirstName1|LastName1|123456"
}
}
},
{
"_index": "dest-personal-data-index",
"_id": "RpRt_CQd6HrBNOdLZGKGfR4AAAAAAAAA",
"_score": 1,
"_source": {
"ingest_timestamp": "2024-01-22T12:19:23.322574528Z",
"values": [
{
"firstName": "FirstName2",
"lastName": "LastName2",
"zipCode": "12345",
"personId": "personId-1",
"ingest_timestamp": "2024-01-18T12:05:48.433166779Z",
"concatenatedPersonalData": "FirstName2|LastName2|12345",
"source": "source-2"
}
],
"combined": {
"concatenatedPersonalData": "FirstName2|LastName2|12345"
}
}
}
]