Hi @Irina_Truong,
I am using an index template for the mapping. The template looks like this:
"template": {
"settings": {
"index": {
"analysis": {
"filter": {
"delimiter_filter": {
"catenate_all": "true",
"type": "word_delimiter"
},
"space_remover": {
"pattern": """\s""",
"type": "pattern_replace",
"replacement": ""
},
"german_stemmer": {
"type": "stemmer",
"language": "light_german"
},
"prefix_filter": {
"type": "edge_ngram",
"min_gram": "3",
"max_gram": "15"
}
},
"analyzer": {
"prefix_analyzer": {
"filter": [
"prefix_filter"
],
"type": "custom",
"tokenizer": "standard"
},
"german_stem_analyzer": {
"filter": [
"lowercase",
"german_stemmer"
],
"type": "custom",
"tokenizer": "standard"
},
"joined_analyzer": {
"filter": [
"lowercase",
"asciifolding",
"space_remover"
],
"type": "custom",
"tokenizer": "join_tokenizer"
},
"delimiter_analyzer": {
"filter": [
"delimiter_filter",
"lowercase",
"asciifolding"
],
"type": "custom",
"tokenizer": "whitespace"
}
},
"tokenizer": {
"join_tokenizer": {
"pattern": """(?=(^|\s)(\w+\s\w+))""",
"type": "pattern",
"group": "2"
}
}
},
"default_pipeline": "adesso-norconex-xml-pipeline"
}
},
"mappings": {
"_routing": {
"required": false
},
"numeric_detection": false,
"dynamic_date_formats": [
"strict_date_optional_time",
"yyyy/MM/dd HH:mm:ss Z||yyyy/MM/dd Z"
],
"_source": {
"excludes": [],
"includes": [],
"enabled": true
},
"dynamic": true,
"dynamic_templates": [],
"date_detection": true,
"properties": {
"teaserSubline_stored_only": {
"type": "text",
"fields": {
"delimiter": {
"analyzer": "delimiter_analyzer",
"type": "text"
},
"joined": {
"analyzer": "joined_analyzer",
"type": "text"
},
"prefix": {
"analyzer": "prefix_analyzer",
"type": "text"
},
"stem": {
"analyzer": "german_stem_analyzer",
"type": "text"
}
}
},
"link": {
"type": "keyword"
},
"teaserPicture474x474_stored_only": {
"type": "keyword"
},
"colorSchemeCssClass_stored_only": {
"type": "keyword"
},
"display_content": {
"type": "text",
"fields": {
"delimiter": {
"analyzer": "delimiter_analyzer",
"type": "text"
},
"joined": {
"analyzer": "joined_analyzer",
"type": "text"
},
"prefix": {
"analyzer": "prefix_analyzer",
"type": "text"
},
"stem": {
"analyzer": "german_stem_analyzer",
"type": "text"
}
}
},
"title": {
"type": "text",
"fields": {
"delimiter": {
"analyzer": "delimiter_analyzer",
"type": "text"
},
"joined": {
"analyzer": "joined_analyzer",
"type": "text"
},
"prefix": {
"analyzer": "prefix_analyzer",
"type": "text"
},
"stem": {
"analyzer": "german_stem_analyzer",
"type": "text"
}
}
},
"content": {
"type": "text",
"fields": {
"delimiter": {
"analyzer": "delimiter_analyzer",
"type": "text"
},
"joined": {
"analyzer": "joined_analyzer",
"type": "text"
},
"prefix": {
"analyzer": "prefix_analyzer",
"type": "text"
},
"stem": {
"analyzer": "german_stem_analyzer",
"type": "text"
}
}
},
"source_url": {
"type": "keyword"
},
"content_type_multi_keyword": {
"type": "keyword"
},
"@timestamp": {
"type": "date"
},
"teaserPicture948x948_stored_only": {
"type": "keyword"
},
"site_multi_keyword": {
"type": "keyword"
},
"teaserPicture750x500_stored_only": {
"type": "keyword"
},
"headlines": {
"type": "text",
"fields": {
"delimiter": {
"analyzer": "delimiter_analyzer",
"type": "text"
},
"joined": {
"analyzer": "joined_analyzer",
"type": "text"
},
"prefix": {
"analyzer": "prefix_analyzer",
"type": "text"
},
"stem": {
"analyzer": "german_stem_analyzer",
"type": "text"
}
}
},
"branche_id": {
"type": "keyword"
},
"topic": {
"type": "keyword"
},
"mime_type_multi_keyword": {
"type": "keyword"
},
"id": {
"type": "keyword"
},
"teaserPicture1500x1000_stored_only": {
"type": "keyword"
},
"language_multi_keyword": {
"type": "keyword"
},
"branche_name": {
"type": "keyword"
},
"crawldate": {
"type": "date"
}
}
}
}
The processors of the pipeline are as follows, in case they are the reason for the error:
[
{
"script": {
"source": "for (int i=0; i<ctx.keys.length; i++){\n ctx[ctx.keys[i]] = ctx.values[i]\n}"
}
},
{
"split": {
"field": "headlines",
"separator": "\\n"
}
},
{
"date": {
"field": "date_l",
"formats": [
"UNIX_MS"
]
}
},
{
"remove": {
"field": [
"values",
"keys",
"date_l",
"date_date"
],
"ignore_missing": true
}
},
{
"pipeline": {
"name": "fix-adesso-link"
}
},
{
"set": {
"field": "id",
"copy_from": "_id"
}
}
]
Since the documents come in with two parallel array fields,
keys = [key1, key2, key3]
values = [value1, value2, value3]
I am using a script processor to turn each key/value pair into a proper field on the document.