hello everyone, I've been having a little difficulty dealing with this collection of files I'm trying to get indexed into the elk stack. As of the current conf file the system is parsing everything into the system and adding the appropriate tags to the data however as you will see that the original document contains multiple fields, and as currently configured each key value pair produces it's on document. I've come to the conclusion that I'm apparently making this more difficult that I should be so now it's time to reach out to the better minds out there for assistance.
In the interest of saving space I have removed most of the value fields. most note worthy is that the post_content field holds html make up which is the reason for the regex expressions.
{
"ID" : 12187,
"post_author" : 24562,
"post_date" : "2015-06-15 16:27:59",
"post_date_gmt" : "2015-06-15 20:27:59",
"post_content" : "contents of article containing html markup",
"post_title" : "title of article plain text",
"post_excerpt" : "",
"post_status" : "publish",
"comment_status" : "open",
"ping_status" : "open",
"post_password" : "",
"post_name" : "post name, plain text",
"to_ping" : "",
"pinged" : "",
"post_modified" : "2016-08-02 19:34:58",
"post_modified_gmt" : "2016-08-02 23:34:58",
"post_content_filtered" : "",
"post_parent" : 0,
"guid" : "https address",
"menu_order" : 0,
"post_type" : "msc",
"post_mime_type" : "",
"comment_count" : 22
},
{
"ID" : 12544,
"post_author" : 113708,
"post_date" : "2015-06-17 16:20:01",
"post_date_gmt" : "2015-06-17 20:20:01",
"post_content" : "",
"post_title" : "Automatas Finitos",
"post_excerpt" : "",
"post_status" : "publish",
"comment_status" : "open",
"ping_status" : "open",
"post_password" : "",
"post_name" : "automatas-finitos",
"to_ping" : "",
"pinged" : "",
"post_modified" : "2016-02-24 19:48:23",
"post_modified_gmt" : "2016-02-25 00:48:23",
"post_content_filtered" : "",
"post_parent" : 0,
"guid" : " https web addresss",
"menu_order" : 0,
"post_type" : "msc",
"post_mime_type" : "",
"comment_count" : 0
}
]
here is the configuration file, nothing particularly interesting however I'm assuming I'm either using the wrong plugins or have them configured incorrectly.
input {
s3 {
access_key_id => "redacted"
secret_access_key => "redacted"
bucket => "bucket_name"
region => "us-east-1"
prefix => "/"
delete => "true"
enable_metric => "false"
}
}
filter {
mutate {
gsub => [ "message", "(?<=<post_content>[\\",])|(<(.*?)>)|(\s)", " " ]
}
kv {
source => "message"
target => "[dataclass][tags]"
field_split => ","
value_split => ":"
#trim => "([[{\]\}])"
trimkey => "(\\\")|(\s)"
remove_field => ["message"]
include_brackets => "true"
include_keys => ["ID", "post_author", "post_date", "post_date_gmt", "post_content", "post_title", "post_excerpt", "post_status", "comment_status", "ping_status",
"post_password", "post_name", "to_ping", "pinged", "post_modified", "post_modified_gmt", "post_content_filtered", "post_parent", "guid", "menu_order",
"post_type", "post_mime_type", "comment_count"]
}
}
output {
elasticsearch {
user => "redacted"
password => "redacted"
hosts => "localhost:9200"
index => "logstash-articles"
document_type => "articles"
}
}
currently it is returning a json for each field however the fields are adding correctly (an example)
{
"_index": "logstash-articles",
"_type": "articles",
"_id": "AVrx64omhUtoxq_cTdpn",
"_score": null,
"_source": {
"@timestamp": "2017-03-21T17:30:29.748Z",
"dataclass": {
"tags": {
"ID": "85946"
}
},
"@version": "1"
},
"fields": {
"@timestamp": [
1490117429748
]
},
"sort": [
1490117429748
]
}
thanks for any advice in advance