Unable to index HTML content in Elasticsearch

Hello all,
I am trying to index data that contains some HTML content.
I tried to use the 'html_strip' character filter, but it is throwing an error. Can you please help me index this data? I have tried googling the problem, but nothing has helped.

PUT test
{
"settings": {
"index": {
"number_of_shards": 1,
"number_of_replicas": 0
},
"analysis": {
"analyzer": {
"ch_analyzer": {
"tokenizer": "standard",
"char_filter": [
"html_strip"
]
}
}
}
},
"mappings": {
"_doc": {
"properties": {
"content.links": {
"type": "text",
"analyzer": "ch_analyzer"
}
}
}
}
}

input:

{"index":{"_type":"_doc","_id":"1486039882869"}}
{"asset.isflex":"true","page.isbreadcrumbvisible":"true","test.assetid":"Page:1486039882869","asset.type_description":"Page","page.footer":"Content:1486039854710","asset.type":"Page","page.centralcontent":"Content:1486039859511","record.spec":"Page:1486039882869","asset.subtype":"GenericPageDefinition","page.isleftmenu":"false","page.labelbreadcrumb":"Mobile","Endeca.Action":"UPSERT","asset.type_name":"Page","page.name":"Business_main_mobile","page.header":"Content:1486039843374","asset.id":"1486039882869","asset.type_subtype":"GenericPageDefinition","asset.type_plural":"Pages","asset.locale":"en","page.description":"","asset.typename":"Page","page.template":"PageTemplate"}
{"index":{"_type":"_doc","_id":"1486039861928"}}
{"asset.isflex":"true","test.assetid":"Content:1486039861928","asset.type_description":"Content","content.title":"COSMOTE Business One","asset.type":"Content","record.spec":"Content:1486039861928","asset.subtype":"Menu_Content","Endeca.Action":"UPSERT","asset.type_name":"Content","content.template":"Menu_Template","content.links":"Φτιάξε το πρόγραμμά σου","asset.id":"1486039861928","asset.type_subtype":"Menu_Content","asset.type_plural":"Contents","asset.locale":"gr","asset.typename":"Content","content.description":"","content.name":"Menu_Content50"}
{"index":{"_type":"_doc","_id":"1486055804091"}}
{"asset.isflex":"true","test.assetid":"Content:1486055804091","asset.type_description":"Content","asset.type":"Content","record.spec":"Content:1486055804091","asset.subtype":"contentFormDef","Endeca.Action":"UPSERT","asset.type_name":"Content","content.template":"contentMainFormTemplate","content.contentbox":"Content:1486055804389","content.contentfield":"Content:1486055805134","asset.id":"1486055804091","asset.type_subtype":"contentFormDef","asset.type_plural":"Contents","asset.locale":"gr","asset.typename":"Content","content.description":"","content.name":"Huawei_content_form"}
{"index":{"_type":"_doc","_id":"1486039835632"}}

Output

[oracle@oel64 input_data]$ curl -H "Content-Type: application/json" -XPOST "localhost:9200/test/_bulk?pretty&refresh" --data-binary "@wcssample.json"
{
"took" : 84,
"errors" : true,
"items" : [
{
"index" : {
"_index" : "test",
"_type" : "_doc",
"_id" : "1486039882869",
"_version" : 1,
"result" : "created",
"forced_refresh" : true,
"_shards" : {
"total" : 1,
"successful" : 1,
"failed" : 0
},
"_seq_no" : 0,
"_primary_term" : 1,
"status" : 201
}
},
{
"index" : {
"_index" : "test",
"_type" : "_doc",
"_id" : "1486039861928",
"status" : 400,
"error" : {
"type" : "mapper_parsing_exception",
"reason" : "failed to parse",
"caused_by" : {
"type" : "json_parse_exception",
"reason" : "Unexpected character ('>' (code 62)): was expecting comma to separate Object entries\n at [Source: org.elasticsearch.common.bytes.BytesReference$MarkSupportingStreamInputWrapper@1d11e3c1; line: 1, column: 414]"
}
}
}
},
{
"index" : {
"_index" : "test",
"_type" : "_doc",
"_id" : "1486055804091",
"_version" : 1,
"result" : "created",
"forced_refresh" : true,
"_shards" : {
"total" : 1,
"successful" : 1,
"failed" : 0
},
"_seq_no" : 1,
"_primary_term" : 1,
"status" : 201
}
}
]
}

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.