Terms aggregation split by coma

I have a bunch of Elasticsearch documents that contain information about study fields. I'm trying to aggregate the studyfields field to extract the number of "study fields" instances from the job posting. e.g. data science, web, network security, etc. Instead what I'm getting are buckets that match the title as a whole instead of the each word it the study field. e.g. "data science, web, network security", "data analyst, security network", etc.

How can I tell Elasticsearch to split the aggregation based on each word in the study fields as opposed the matching the value of the whole field.

Current query:

GET /test_index/_search
{
    "query": {
        "match_all": {}
    },
	"aggs": {
		"group_by_state": {
			"terms": {
				"field": "studyfeild"
			}
		}
	}
}

Unwanted Output:

{
  ...
  "hits": {
    "total": 63,
    "max_score": 0,
    "hits": []
  },
  "aggregations": {
    "group_by_state": {
      "doc_count_error_upper_bound": 0,
      "sum_other_doc_count": 14,
      "buckets": [{
          "key": "data science, web, network security",
          "doc_count": 6
        },{
          "key": "data analyst, network security",
          "doc_count": 6
        },
        ...
      ]
    }
  }
}

Desired Output:


{
  ...
  "hits": {
    "total": 63,
    "max_score": 0,
    "hits": []
  },
  "aggregations": {
    "group_by_state": {
      "doc_count_error_upper_bound": 0,
      "sum_other_doc_count": 14,
      "buckets": [{
          "key": "data science",
          "doc_count": 12
        },{
          "key": "web",
          "doc_count": 8
        },{
          "key": "network security",
          "doc_count": 5
        },{
          "key": "data analyst",
          "doc_count": 5
        },
        ...
      ]
    }
  }
}

Hi @Imad_Ourak

Maybe you can solve it using some script but this would affect the performance of the query.

In my opinion, you should reindex the index by creating a new field that receives an array. You would need a pipeline that transforms this string into an array. And you can use the processor during indexing so that new records are created in the form of an array.

An example:

POST idx_test/_doc
{"test_field": "data science, web, network security"}

POST idx_test/_doc
{"test_field": "data analyst, security network"}

PUT _ingest/pipeline/array_create
{
  "processors": [
    {
      "script": {
        "lang": "painless",
        "source": """
            String[] array = ctx['test_field'].splitOnToken(',');
            ArrayList list = new ArrayList();
            for(int i; i < array.length; i++) {
              list.add(array[i].trim())
            }
            ctx['field_array'] = list;
          """
      }
    }
  ]
}

POST _reindex
{
  "source": {
    "index": "idx_test"
  },
  "dest": {
    "index": "idx_test_2",
    "pipeline": "array_create"
  }
}

GET idx_test_2/_search
{
  "size": 0,
  "aggs": {
    "NAME": {
      "terms": {
        "field": "field_array.keyword",
        "size": 10
      }
    }
  }
}

Results

"aggregations": {
    "NAME": {
      "doc_count_error_upper_bound": 0,
      "sum_other_doc_count": 0,
      "buckets": [
        {
          "key": "data analyst",
          "doc_count": 1
        },
        {
          "key": "data science",
          "doc_count": 1
        },
        {
          "key": "network security",
          "doc_count": 1
        },
        {
          "key": "security network",
          "doc_count": 1
        },
        {
          "key": "web",
          "doc_count": 1
        }
      ]
    }
  }

Thank you @RabBit_BR for replying me, the example you provided works properly, but for my data it doesn't work and I get this error:

{
  "took": 56,
  "timed_out": false,
  "total": 18166,
  "updated": 0,
  "created": 0,
  "deleted": 0,
  "batches": 1,
  "version_conflicts": 0,
  "noops": 0,
  "retries": {
    "bulk": 0,
    "search": 0
  },
  "throttled_millis": 0,
  "requests_per_second": -1,
  "throttled_until_millis": 0,
  "failures": [
    {
      "index": "orgunit_index_2",
      "id": "3",
      "cause": {
        "type": "script_exception",
        "reason": "runtime error",
        "script_stack": [
          """array = ctx['cdm_orgunit_24_studyfields.text'].splitOnToken(',');
            ArrayList """,
          "                                              ^---- HERE"
        ],
        "script": " ...",
        "lang": "painless",
        "position": {
          "offset": 68,
          "start": 22,
          "end": 110
        },
        "caused_by": {
          "type": "null_pointer_exception",
          "reason": "cannot access method/field [splitOnToken] from a null def reference"
        }
      },
      "status": 400
    },

this is the structure of mapping of "cdm_orgunit_24_studyfields":

 "cdm_orgunit_24_studyfields": {
                "properties": {
                    "lang": {
                        "type": "keyword"
                    },
                    "text": {
                        "type": "text",
                        
                        "fields": {
                            "keyword": {
                                "type": "keyword",
                                "ignore_above": 256
                            },
                            "completion": {
                                "type": "completion"
                            }
                        }
                    }
                }
            },

and this the data of this field :

 "hits": [
      {
        "_index": "orgunit_index",
        "_id": "3",
        "_score": 1,
        "_source": {
          "cdm_orgunit_24_studyfields": {
            "text": """Économie,
L'informatique,
Mathématiques,
Informatique,
Chimie,
L'histoire,
La physique,
Ingénierie informatique
+plus""",
            "lang": "fra"
          }
        }
      },
      {
        "_index": "orgunit_index",
        "_id": "12",
        "_score": 1,
        "_source": {
          "cdm_orgunit_24_studyfields": {
            "text": """Administration des affaires,
Économie,
L'informatique,
Loi,
Chimie,
Commercialisation,
La physique,
Ingénierie mécanique
+plus""",
            "lang": "fra"
          }
        }
      },

try this:

String[] array = ctx['cdm_orgunit_24_studyfields'].text.splitOnToken(',');

now i get this error :

 {
      "index": "orgunit_index_2",
      "id": "1038",
      "cause": {
        "type": "script_exception",
        "reason": "runtime error",
        "script_stack": [
          """array = ctx['cdm_orgunit_24_studyfields'].text.splitOnToken(',');
            ArrayList """,
          "                                         ^---- HERE"
        ],
        "script": " ...",
        "lang": "painless",
        "position": {
          "offset": 63,
          "start": 22,
          "end": 110
        },
        "caused_by": {
          "type": "null_pointer_exception",
          "reason": "cannot access method/field [text] from a null def reference"
        }
      },
      "status": 400
    }
  ]
}

Maybe you have some docs with cdm_orgunit_24_studyfields null. Try add check cdm_orgunit_24_studyfields is null in script.

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.