Need advice on ingesting this type of data

florentmair · February 27, 2017, 2:38pm

Hi

I have daily results from an audit and I am not sure how I should manage the data.
All I want is to visualise the evolution of the number of documents while keeping the structure.

I want to be able to visualise the number of documents by type or format, the evolution of this number, ...

So i have two questions:

First, I am trying to parse the log file in multiline mode with this configuration:

# Read the audit log file and group continuation lines into a single event.
input {
  file {
    path => "/tmp/input"
    # Any line matching "^\s" (i.e. starting with whitespace) is attached
    # to the previous line instead of starting a new event.
    codec => multiline {
      what => "previous"
      pattern => "^\s"
    }
  }
}
This works, but I don't know what the best way to process the result is afterwards: should I use split or kv?
Second, for the display, should I organize the data as:
Field = nb_doc_by_type_{doc_type}, value => the count
Field = nb_doc_by_format_{doc_format}, value => the count
or would a hash be better?

Sample of the audit log:

Number of Documents by Type: 
 
  Document Type                            Count
 
  da_document                          2,610,975
  dm_cabinet                              26,182
  dm_document                             13,687
  dm_folder                                5,215
  dm_sysobject                             2,427
  dm_smart_list                            1,503
  da_fld_replication                       1,070
  da_export_config_data                      513
  dm_job_request                             414
  dm_app_ref                                 218
  Total:                         -------------
                                       2,670,487
 
Number of Documents by Format: 
 
  Document Format                          Count
 
  excel8book                           1,089,879
  msw8                                   714,959
  pdf                                    514,204
  crtext                                 107,936
  msw12                                   49,236
  tiff                                    48,163
  ppt8                                    29,525
  zip                                     28,703
  text                                     9,094
  excel12book                              7,692

  Total:                         -------------
                                       2,624,756

Thanks

florentmair · March 14, 2017, 2:08pm

I now have this filter:

# Turn "stateofdocbase" audit events into count fields; everything that is
# neither a "NUMBER OF ..." section nor the "GTotal Content" line is dropped.
filter {
  if [type] == "stateofdocbase" {
    # Extract the docbase name and the run timestamp from the source file path.
    grok {
      match => { "path" => "RunJob/%{DATA:docbase}/RunJob_%{DATA:timestamp}s_dm_StateOfDocbase.log" }
    }

    # Parse the timestamp taken from the file name, add the two Index* fields,
    # and discard the temporary "timestamp" field.
    date {
      match => [ "timestamp", "yy_MM_dd_HH'h'mm'm'ss" ]
      add_field => {
        "IndexType" => "dmt"
        "IndexRotation" => "none"
      }
      remove_field => [ "timestamp" ]
    }

    if [message] =~ /^NUMBER OF .*:/ {
      # Capture the text between "NUMBER OF " and " :" as the section name.
      grok {
        match => { "message" => "^NUMBER OF %{DATA:fieldinfo} :" }
      }

      # Section name: spaces become underscores.
      # Message: commas (thousands separators in the counts) are removed.
      mutate {
        gsub => [
          "fieldinfo", " ", "_",
          "message", ",", ""
        ]
      }

      # Split the message into key/value pairs, stored under "dynamic_name".
      kv {
        target => "dynamic_name"
        field_split => "\n"
        trimkey => " <>"
        trim => " "
      }

      # Rename the container after the section (e.g. NB_<section name>) and
      # remove the fields that are no longer needed.
      mutate {
        rename => { "dynamic_name" => "NB_%{fieldinfo}" }
        remove_field => [ "fieldinfo", "message" ]
      }
    } else if [message] =~ /GTotal Content/ {
      # Remove commas from the message before extracting the number.
      mutate {
        gsub => [ "message", ",", "" ]
      }
      grok {
        match => { "message" => "GTotal Content =\s+%{DATA:NB_GTOTAL_CONTENT}\s*\n" }
        remove_field => [ "message" ]
      }
      # Store the grand total as a number rather than a string.
      mutate {
        convert => { "NB_GTOTAL_CONTENT" => "integer" }
      }
    } else {
      drop { }
    }

    mutate {
      remove_tag => [ "multiline" ]
    }
  }
}

But I have a problem with the display in Kibana, because my data is stored like this:

{
  "_index": "logstash-dmt",
  "_type": "stateofdocbase",
  "_id": "AVrJTMRHPMa1Ahhz2UYM",
  "_score": null,
  "_source": {
"IndexName": "logstash-dmt",
"@timestamp": "2017-03-12T21:30:02.000Z",
"NB_DOCUMENTS_BY_FORMAT": {
  "bmp": "1",
  "csv": "661",
  "msw6template": "10",
  "java": "4",
  "excel12bbook": "34",
  "excel12mebook": "1361",
  "PDF_StampTest": "4",
  "text": "12232",
  "excel5book": "8",
  "powerpoint": "2",
  "zip": "22678",
  "excel12book": "3798",
  "ppt8_template": "41",
  "ppt12": "3976",
  "excel8book": "821179",
  "ppt8": "26006",
  "pdf": "427521"
},
"@version": "1",
"host": "XXX",
"IndexRotation": "none",
"type": "stateofdocbase",
"IndexType": "dmt",
"tags": [
  "multiline"
]
  },
  "fields": {
"@timestamp": [
  1489354202000
]
  },
  "sort": [
1489354202000
  ]
}

What should I do in this case?
Thanks

system · April 11, 2017, 2:08pm

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.

Topic		Replies	Views
How to parse mix json logs Logstash	28	4580	March 25, 2019
Denormalize data within a log file Logstash	25	2070	August 27, 2018
Parsing csv file through Logstash Logstash	18	2304	July 9, 2021
Inject data from a log file Logstash	8	2915	December 26, 2017
Ingesting table-like data Logstash	6	725	May 30, 2017

Need advice on ingesting this type of data

Related topics