Need advice on how to ingest this type of data

Hi

I have daily results from an audit and I am not sure how I should manage the data.
All I want is to visualise the evolution of the number of documents while keeping the structure.

I want to be able to visualise the number of documents by type or format, the evolution of this number, ...

So i have two questions:

  • First, I am trying to parse the log file in multiline mode with this pattern:

    # Read the audit log from /tmp/input and group its lines into multi-line events.
    input {
    file {
    path => "/tmp/input"
    codec => multiline {
    # A line that starts with a whitespace character ...
    pattern => "^\s"
    # ... is appended to the previous line's event instead of starting a new one.
    what => "previous"
    }
    }
    }
    This is working, but after that I don't know what the best solution is to manage the result: should I use split or kv?

  • Next, for the display, should I organize the data as:
    Field = nb_doc_by_type_{doc_type}, value => the count
    Field = nb_doc_by_format_{doc_format}, value => the count
    or would a hash be better?

Sample of the audit log:

Number of Documents by Type: 
 
  Document Type                            Count
 
  da_document                          2,610,975
  dm_cabinet                              26,182
  dm_document                             13,687
  dm_folder                                5,215
  dm_sysobject                             2,427
  dm_smart_list                            1,503
  da_fld_replication                       1,070
  da_export_config_data                      513
  dm_job_request                             414
  dm_app_ref                                 218
  Total:                         -------------
                                       2,670,487
 
Number of Documents by Format: 
 
  Document Format                          Count
 
  excel8book                           1,089,879
  msw8                                   714,959
  pdf                                    514,204
  crtext                                 107,936
  msw12                                   49,236
  tiff                                    48,163
  ppt8                                    29,525
  zip                                     28,703
  text                                     9,094
  excel12book                              7,692

  Total:                         -------------
                                       2,624,756

Thanks

I now have this filter:

# Filter pipeline for events of type "stateofdocbase".
# It derives metadata from the source file path, then turns either a
# "NUMBER OF ..." section or the "GTotal Content" line into fields.
# Any other event of this type is dropped.
filter {
  if [type] == "stateofdocbase" {
    # Extract the docbase name and the run timestamp from the log file path.
    # The literal "s" after %{DATA:timestamp} consumes the trailing seconds
    # marker so it is not part of the captured value.
    # NOTE(review): the "." in ".log" is unescaped, so it matches any character.
    grok {
        match => { "path" => "RunJob/%{DATA:docbase}/RunJob_%{DATA:timestamp}s_dm_StateOfDocbase.log" }
    }
    # Parse the captured timestamp (e.g. 17_03_12_21h30m02) as the event date,
    # add the two Index* routing fields, and discard the raw string.
    # NOTE(review): add_field is given twice; this relies on Logstash merging
    # repeated settings - confirm, or combine them into one hash.
    date {
        match => [ "timestamp" , "yy_MM_dd_HH'h'mm'm'ss" ]
        add_field => [ "IndexType", "dmt" ]
        add_field => [ "IndexRotation", "none" ]
        remove_field  => [ "timestamp" ]
    }

    # Section events: the message starts with "NUMBER OF" and contains a ":".
    if [message] =~ /^NUMBER OF .*:/ {
        # Capture the section title (the text between "NUMBER OF " and " :").
        # NOTE(review): this pattern requires a space before the colon, while
        # the condition above does not - confirm against the real log lines.
        grok {
             match => { "message" => "^NUMBER OF %{DATA:fieldinfo} :"}
        }

        # Make the title usable as a field name (spaces -> underscores) and
        # strip the thousands separators from the counts.
        # NOTE(review): gsub is given twice; this relies on Logstash merging
        # repeated settings - confirm, or use a single six-element array.
        mutate {
           gsub => [ "fieldinfo", " ", "_" ]
           gsub => [ "message" , "," , "" ]
        }

        # Split the message into one key/value pair per line and store the
        # pairs under the temporary "dynamic_name" field.
        # NOTE(review): value_split is not set, so the kv default applies -
        # confirm it matches the separator used in the real log lines.
        # NOTE(review): the counts are not converted here, so they are
        # presumably stored as strings; a conversion step would be needed
        # for numeric aggregations.
        kv {
           field_split => "\n"
           target => "dynamic_name"
           trim => " "
           trimkey => " <>"
        }

        # Rename the temporary field after the section title, e.g.
        # NB_DOCUMENTS_BY_FORMAT, and drop the fields that are no longer needed.
        mutate {
           rename => { "dynamic_name" => "NB_%{fieldinfo}" }
           remove_field  => [ "fieldinfo" , "message"]
        }

    # Grand-total events: extract the single number after "GTotal Content =".
    } else if [message] =~ /GTotal Content/ {
        # Strip the thousands separators before matching.
        mutate {
           gsub => [ "message" , "," , "" ]
        }
        # Capture the value up to the end of the line. The grok-level
        # remove_field drops the raw message along with the capture.
        grok {
             match => { "message" => "GTotal Content =\s+%{DATA:NB_GTOTAL_CONTENT}\s*\n" }
             remove_field  => ["message"]
        }
        # Store the total as an integer.
        mutate {
           convert => { "NB_GTOTAL_CONTENT" => "integer" }
        }
    } else {
        # Anything else in this log type is discarded.
        drop{}
    }
    # Remove the "multiline" tag from the events that are kept.
    mutate {
      remove_tag => [ "multiline" ]
    }
  }
}

But I have a problem with the display in Kibana, because my data is stored like this:

{
  "_index": "logstash-dmt",
  "_type": "stateofdocbase",
  "_id": "AVrJTMRHPMa1Ahhz2UYM",
  "_score": null,
  "_source": {
"IndexName": "logstash-dmt",
"@timestamp": "2017-03-12T21:30:02.000Z",
"NB_DOCUMENTS_BY_FORMAT": {
  "bmp": "1",
  "csv": "661",
  "msw6template": "10",
  "java": "4",
  "excel12bbook": "34",
  "excel12mebook": "1361",
  "PDF_StampTest": "4",
  "text": "12232",
  "excel5book": "8",
  "powerpoint": "2",
  "zip": "22678",
  "excel12book": "3798",
  "ppt8_template": "41",
  "ppt12": "3976",
  "excel8book": "821179",
  "ppt8": "26006",
  "pdf": "427521"
},
"@version": "1",
"host": "XXX",
"IndexRotation": "none",
"type": "stateofdocbase",
"IndexType": "dmt",
"tags": [
  "multiline"
]
  },
  "fields": {
"@timestamp": [
  1489354202000
]
  },
  "sort": [
1489354202000
  ]
}

What should I do in this case?
Thanks

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.