To exclude around 350 fields in JSON in Logstash

Hi,

I am trying to exclude many keys from a JSON document. The data is a GitHub webhook payload. Because the number of contributors is fairly high, the amount of detail it carries is also high. For example, there are around 720 fields (key/value pairs) in total, but I want to shrink that to about 300.

I know the field types, but I am not sure how to mention them one by one, and doing so is also a tedious process.

I have included the JSON file below. Since it contains arrays of data, I might need to mention the keys in the same nested format, for example: array1[key1], array1[key1[key2]].

Almost all the arrays inside the JSON have a key named "event_url". Is there any way I can specify, in one place, that any key named "event_url" should be excluded? As you can see, the JSON has multiple entries with the key "event_url".

JSON File.

{
         "forks": 2,
        "open_issues": 2,
        "watchers": 0,
        "default_branch": "master"
      }
    },
    "_links": {
      "self": {
        "href": "https://Startup_GitLab/api/v3/repos/StartupORG/REPO_Name/pulls/194"
      },
      "html": {
        "href": "https://Startup_GitLab/StartupORG/REPO_Name/pull/194"
      },
      "issue": {
        "href": "https://Startup_GitLab/api/v3/repos/StartupORG/REPO_Name/issues/194"
      },
      "comments": {
        "href": "https://Startup_GitLab/api/v3/repos/StartupORG/REPO_Name/issues/194/comments"
      },
      "review_comments": {
        "href": "https://Startup_GitLab/api/v3/repos/StartupORG/REPO_Name/pulls/194/comments"
      },
      "review_comment": {
        "href": "https://Startup_GitLab/api/v3/repos/StartupORG/REPO_Name/pulls/comments{/number}"
      },
      "commits": {
        "href": "https://Startup_GitLab/api/v3/repos/StartupORG/REPO_Name/pulls/194/commits"
      },
      "statuses": {
        "href": "https://Startup_GitLab/api/v3/repos/StartupORG/REPO_Name/statuses/cc7e20be3af91ed318cbe4e6e06da94d34ba085b"
      }
    },
    "author_association": "CONTRIBUTOR",
    "draft": false,
    "merged": false,
    "mergeable": null,
    "rebaseable": null,
    "mergeable_state": "unknown",
    "merged_by": null,
    "comments": 0,
    "review_comments": 0,
    "maintainer_can_modify": false,
    "commits": 1,
    "additions": 932,
    "deletions": 172,
    "changed_files": 24
  },
  "requested_reviewer": {
    "login": "Ernest_Young",
    "id": 3786,
    "node_id": "MDQ6VXNlcjM3ODY=",
    "avatar_url": "https://avatars.Startup_GitLab/u/3786?",
    "gravatar_id": "",
    "url": "https://Startup_GitLab/api/v3/users/Ernest_Young",
    "html_url": "https://Startup_GitLab/Ernest_Young",
    "followers_url": "https://Startup_GitLab/api/v3/users/Ernest_Young/followers",
    "following_url": "https://Startup_GitLab/api/v3/users/Ernest_Young/following{/other_user}",
    "gists_url": "https://Startup_GitLab/api/v3/users/Ernest_Young/gists{/gist_id}",
    "starred_url": "https://Startup_GitLab/api/v3/users/Ernest_Young/starred{/owner}{/repo}",
    "subscriptions_url": "https://Startup_GitLab/api/v3/users/Ernest_Young/subscriptions",
    "organizations_url": "https://Startup_GitLab/api/v3/users/Ernest_Young/orgs",
    "repos_url": "https://Startup_GitLab/api/v3/users/Ernest_Young/repos",
    "events_url": "https://Startup_GitLab/api/v3/users/Ernest_Young/events{/privacy}",
    "received_events_url": "https://Startup_GitLab/api/v3/users/Ernest_Young/received_events",
    "type": "User",
    "site_admin": false,
    "ldap_dn": "CN=young\\, eric,OU=Engineering,OU=MA Hopkinton,OU=US Users,DC=corp,DC=Company_Name,DC=com"
  },
  "repository": {
    "id": 16234,
    "node_id": "MDEwOlJlcG9zaXRvcnkxNjIzNA==",
    "name": "REPO_Name",
    "full_name": "StartupORG/REPO_Name",
    "private": true,
    "owner": {
      "login": "StartupORG",
      "id": 2356,
      "node_id": "MDEyOk9yZ2FuaXphdGlvbjIzNTY=",
      "avatar_url": "https://avatars.Startup_GitLab/u/2356?",
      "gravatar_id": "",
      "url": "https://Startup_GitLab/api/v3/users/StartupORG",
      "html_url": "https://Startup_GitLab/StartupORG",
      "followers_url": "https://Startup_GitLab/api/v3/users/StartupORG/followers",
      "following_url": "https://Startup_GitLab/api/v3/users/StartupORG/following{/other_user}",
      "gists_url": "https://Startup_GitLab/api/v3/users/StartupORG/gists{/gist_id}",
      "starred_url": "https://Startup_GitLab/api/v3/users/StartupORG/starred{/owner}{/repo}",
      "subscriptions_url": "https://Startup_GitLab/api/v3/users/StartupORG/subscriptions",
      "organizations_url": "https://Startup_GitLab/api/v3/users/StartupORG/orgs",
      "repos_url": "https://Startup_GitLab/api/v3/users/StartupORG/repos",
      "events_url": "https://Startup_GitLab/api/v3/users/StartupORG/events{/privacy}",
      "received_events_url": "https://Startup_GitLab/api/v3/users/StartupORG/received_events",
      "type": "Organization",
      "site_admin": false
    },
    "html_url": "https://Startup_GitLab/StartupORG/REPO_Name",
    "description": "Repository for CSI Plugin development project",
    "fork": false,
    "url": "https://Startup_GitLab/api/v3/repos/StartupORG/REPO_Name",
    "forks_url": "https://Startup_GitLab/api/v3/repos/StartupORG/REPO_Name/forks",
    "keys_url": "https://Startup_GitLab/api/v3/repos/StartupORG/REPO_Name/keys{/key_id}",
    "collaborators_url": "https://Startup_GitLab/api/v3/repos/StartupORG/REPO_Name/collaborators{/collaborator}",
    "teams_url": "https://Startup_GitLab/api/v3/repos/StartupORG/REPO_Name/teams",
    "hooks_url": "https://Startup_GitLab/api/v3/repos/StartupORG/REPO_Name/hooks",
    "issue_events_url": "https://Startup_GitLab/api/v3/repos/StartupORG/REPO_Name/issues/events{/number}",
    "events_url": "https://Startup_GitLab/api/v3/repos/StartupORG/REPO_Name/events",
    "assignees_url": "https://Startup_GitLab/api/v3/repos/StartupORG/REPO_Name/assignees{/user}",
    "branches_url": "https://Startup_GitLab/api/v3/repos/StartupORG/REPO_Name/branches{/branch}",
    "tags_url": "https://Startup_GitLab/api/v3/repos/StartupORG/REPO_Name/tags",
    "blobs_url": "https://Startup_GitLab/api/v3/repos/StartupORG/REPO_Name/git/blobs{/sha}",
    "git_tags_url": "https://Startup_GitLab/api/v3/repos/StartupORG/REPO_Name/git/tags{/sha}",
    "git_refs_url": "https://Startup_GitLab/api/v3/repos/StartupORG/REPO_Name/git/refs{/sha}",
    "trees_url": "https://Startup_GitLab/api/v3/repos/StartupORG/REPO_Name/git/trees{/sha}",
    "statuses_url": "https://Startup_GitLab/api/v3/repos/StartupORG/REPO_Name/statuses/{sha}",
    "languages_url": "https://Startup_GitLab/api/v3/repos/StartupORG/REPO_Name/languages",
    "stargazers_url": "https://Startup_GitLab/api/v3/repos/StartupORG/REPO_Name/stargazers",
    "contributors_url": "https://Startup_GitLab/api/v3/repos/StartupORG/REPO_Name/contributors",
    "subscribers_url": "https://Startup_GitLab/api/v3/repos/StartupORG/REPO_Name/subscribers",
    "subscription_url": "https://Startup_GitLab/api/v3/repos/StartupORG/REPO_Name/subscription",
    "commits_url": "https://Startup_GitLab/api/v3/repos/StartupORG/REPO_Name/commits{/sha}",
    "git_commits_url": "https://Startup_GitLab/api/v3/repos/StartupORG/REPO_Name/git/commits{/sha}",
    "comments_url": "https://Startup_GitLab/api/v3/repos/StartupORG/REPO_Name/comments{/number}",
    "issue_comment_url": "https://Startup_GitLab/api/v3/repos/StartupORG/REPO_Name/issues/comments{/number}",
    "contents_url": "https://Startup_GitLab/api/v3/repos/StartupORG/REPO_Name/contents/{+path}",
    "compare_url": "https://Startup_GitLab/api/v3/repos/StartupORG/REPO_Name/compare/{base}...{head}",
    "merges_url": "https://Startup_GitLab/api/v3/repos/StartupORG/REPO_Name/merges",
    "archive_url": "https://Startup_GitLab/api/v3/repos/StartupORG/REPO_Name/{archive_format}{/ref}",
    "downloads_url": "https://Startup_GitLab/api/v3/repos/StartupORG/REPO_Name/downloads",
    "issues_url": "https://Startup_GitLab/api/v3/repos/StartupORG/REPO_Name/issues{/number}",
    "pulls_url": "https://Startup_GitLab/api/v3/repos/StartupORG/REPO_Name/pulls{/number}",
    "milestones_url": "https://Startup_GitLab/api/v3/repos/StartupORG/REPO_Name/milestones{/number}",
    "notifications_url": "https://Startup_GitLab/api/v3/repos/StartupORG/REPO_Name/notifications{?since,all,participating}",
    "labels_url": "https://Startup_GitLab/api/v3/repos/StartupORG/REPO_Name/labels{/name}",
    "releases_url": "https://Startup_GitLab/api/v3/repos/StartupORG/REPO_Name/releases{/id}",
    "deployments_url": "https://Startup_GitLab/api/v3/repos/StartupORG/REPO_Name/deployments",
    "created_at": "2019-03-25T03:52:51Z",
    "updated_at": "2020-06-25T10:37:13Z",
    "pushed_at": "2020-06-25T13:10:16Z",
    "git_url": "git://Startup_GitLab/StartupORG/REPO_Name.git",
    "ssh_url": "git@eos2git.cec.lab.Company_Name.com:StartupORG/REPO_Name.git",
    "clone_url": "https://Startup_GitLab/StartupORG/REPO_Name.git",
    "svn_url": "https://Startup_GitLab/StartupORG/REPO_Name",
    "homepage": null,
    "size": 1770,
    "stargazers_count": 0,
    "watchers_count": 0,
    "language": "Go",
    "has_issues": true,
    "has_projects": true,
    "has_downloads": true,
    "has_wiki": true,
    "has_pages": false,
    "forks_count": 2,
    "mirror_url": null,
    "archived": false,
    "disabled": false,
    "open_issues_count": 2,
    "license": {
      "key": "apache-2.0",
      "name": "Apache License 2.0",
      "spdx_id": "Apache-2.0",
      "url": "https://Startup_GitLab/api/v3/licenses/apache-2.0",
      "node_id": "MDc6TGljZW5zZTI="
    },
    "forks": 2,
    "open_issues": 2,
    "watchers": 0,
    "default_branch": "master"
  },
  "organization": {
    "login": "StartupORG",
    "id": 2356,
    "node_id": "MDEyOk9yZ2FuaXphdGlvbjIzNTY=",
    "url": "https://Startup_GitLab/api/v3/orgs/StartupORG",
    "repos_url": "https://Startup_GitLab/api/v3/orgs/StartupORG/repos",
    "events_url": "https://Startup_GitLab/api/v3/orgs/StartupORG/events",
    "hooks_url": "https://Startup_GitLab/api/v3/orgs/StartupORG/hooks",
    "issues_url": "https://Startup_GitLab/api/v3/orgs/StartupORG/issues",
    "members_url": "https://Startup_GitLab/api/v3/orgs/StartupORG/members{/member}",
    "public_members_url": "https://Startup_GitLab/api/v3/orgs/StartupORG/public_members{/member}",
    "avatar_url": "https://avatars.Startup_GitLab/u/2356?",
    "description": null
  },
  "enterprise": {
    "id": 1,
    "slug": "Company_Name-eos2",
    "name": "Company_Name EOS2",
    "node_id": "MDEwOkVudGVycHJpc2Ux",
    "avatar_url": "https://avatars.Startup_GitLab/b/1?",
    "description": null,
    "website_url": null,
    "html_url": "https://Startup_GitLab/enterprises/Company_Name-eos2",
    "created_at": "2019-04-28T18:39:22Z",
    "updated_at": "2019-04-28T18:39:22Z"
  },
  "sender": {
    "login": "Prakash-Dubey",
    "id": 6384,
    "node_id": "MDQ6VXNlcjYzODQ=",
    "avatar_url": "https://avatars.Startup_GitLab/u/6384?",
    "gravatar_id": "",
    "url": "https://Startup_GitLab/api/v3/users/Prakash-Dubey",
    "html_url": "https://Startup_GitLab/Prakash-Dubey",
    "followers_url": "https://Startup_GitLab/api/v3/users/Prakash-Dubey/followers",
    "following_url": "https://Startup_GitLab/api/v3/users/Prakash-Dubey/following{/other_user}",
    "gists_url": "https://Startup_GitLab/api/v3/users/Prakash-Dubey/gists{/gist_id}",
    "starred_url": "https://Startup_GitLab/api/v3/users/Prakash-Dubey/starred{/owner}{/repo}",
    "subscriptions_url": "https://Startup_GitLab/api/v3/users/Prakash-Dubey/subscriptions",
    "organizations_url": "https://Startup_GitLab/api/v3/users/Prakash-Dubey/orgs",
    "repos_url": "https://Startup_GitLab/api/v3/users/Prakash-Dubey/repos",
    "events_url": "https://Startup_GitLab/api/v3/users/Prakash-Dubey/events{/privacy}",
    "received_events_url": "https://Startup_GitLab/api/v3/users/Prakash-Dubey/received_events",
    "type": "User",
    "site_admin": false,
    "ldap_dn": "CN=Prakash_Dubey,OU=AsiaPac,OU=_Dell,OU=International Users,DC=corp,DC=Company_Name,DC=com"
  }
}

My Current HTTP_Filter is below.

# Pipeline as first posted: receive the payload over HTTP, parse the JSON
# text, then try to drop "events_url".
input {
 http {
    host => "ELK_Host_IP"
    port => "8080"
	# Sets the "type" field of each event to "json".
	type => "json"
  }
}

filter {
    # Parse the JSON text held in the "message" field into event fields.
    json {
            source => "message"
    }
	# NOTE(review): "events_url" here names a single top-level field; the
	# events_url keys nested inside other objects are presumably not matched,
	# which is the problem raised in this thread -- confirm.
	mutate {
			remove_field => [ "events_url"] 
	} 
}

This is not valid JSON.

Do you want to remove every occurrence of an events_url field, or every object that contains an events_url field?

Due to the limit of 13000 characters that a thread can hold, I had to truncate the JSON file.

The full JSON file is in this URL -> https://codeshare.io/5Xxg4K

Yes, I would like to exclude all the fields(keys) or every object with the name "event_url"

And I take it when you say event_url you mean events_url?

Yes, you are right. I meant events_url, which occurs more than once in the JSON file. Please refer to the JSON file (URL) which I shared for further information.

OK, you will need a ruby script that iterates over the event and makes recursive calls to itself for each hash and array that it finds. I have failed to write that a few times over the years, but to-day I got around to it...

Create a file called removeKeys.rb that contains

# Logstash ruby-filter hook that receives the filter's script_params.
#
# params - Hash of script parameters. params['keys'] holds the field name(s)
#          to strip, given either as a single String or as an Array.
#
# The value is normalised to an Array of Strings so that later membership
# tests compare whole field names. Left as a bare String, `include?` performs
# a substring match, so keys => "events_url" would also match a field named
# "url". A missing 'keys' parameter becomes an empty list (nothing removed)
# instead of nil.
def register(params)
    @field = Array(params['keys']).map(&:to_s)
end

# Recursively walk +object+ and remove from +event+ every field whose own key
# is listed in +keys+, at any nesting depth.
#
# object - the value stored at +name+: a Hash, an Array or a scalar
#          (nil and false included)
# name   - field reference of +object+, e.g. "[repository][owner][url]"
# keys   - field name, or Array of field names, to remove
# event  - object responding to remove(field_reference)
def removeKeys(object, name, keys, event)
    # Accept a bare String as well as an Array. String#include? is a substring
    # test, so "events_url".include?("url") would wrongly report a match.
    wanted = Array(keys)

    # Last path segment of the reference: "[a][b][c]" -> "c".
    lastElement = name.gsub(/^.*\[/, "").gsub(/\]$/, "")

    if wanted.include? lastElement
        # Matched by name: drop the field whatever its value is -- a scalar,
        # false, nil, or a whole nested hash/array -- and stop descending,
        # since everything below it goes away with it.
        event.remove(name)
    elsif object.kind_of?(Hash)
        object.each { |k, v| removeKeys(v, "#{name}[#{k}]", wanted, event) }
    elsif object.kind_of?(Array)
        object.each_index { |i|
            removeKeys(object[i], "#{name}[#{i}]", wanted, event)
        }
    end
end

# Logstash ruby-filter entry point. Walks every top-level field of the hash
# returned by event.to_hash and hands each value to removeKeys together with
# its "[name]" field reference and the configured key list; the removals are
# applied to the event itself. Returns the event wrapped in a one-element
# array.
def filter(event)
    top_level = event.to_hash
    top_level.each_pair do |field_name, value|
        removeKeys(value, "[#{field_name}]", @field, event)
    end
    [event]
end

Then call it using

    # Run the removeKeys.rb script from the filter stage.
    ruby {
        path => "/home/user/removeKeys.rb"
        # Handed to the script's register(params), which reads params['keys'].
        script_params => { keys => "events_url" }
    }

Note that keys can be an array

        script_params => { keys => [ "events_url", "comments" ] }

You probably need some error checking in the ruby code (you will know you do when logstash crashes) and my apologies if my ruby coding style makes your eyeballs bleed.

1 Like

Excellent! No wonder why ELK is preferred everywhere. Thank you so much for your prompt help. Saved me big time.

I am facing another issue when parsing a date key: Elasticsearch reports an improper format. Should I open a new topic, or can I continue the conversation here?

Error

[2020-06-29T16:56:14,134][WARN ][logstash.outputs.elasticsearch][main][e936d6e5a
c875ea64d8295d0f900f51b95047224c4579e7858b7991eec1d041a] Could not index event t
o Elasticsearch. {:status=>400, :action=>["index", {:_id=>nil, :_index=>"forgith
ub1", :routing=>nil, :_type=>"_doc"}, #<LogStash::Event:0xa1dddc>], :response=>{
"index"=>{"_index"=>"forgithub1", "_type"=>"_doc", "_id"=>"oHoDAXMBKraAf2wLBaXl"
, "status"=>400, "error"=>{"type"=>"mapper_parsing_exception", "reason"=>"failed
 to parse field [repository.pushed_at] of type [long] in document with id 'oHoDA
XMBKraAf2wLBaXl'. Preview of field's value: '2020-06-24T10:05:40Z'", "caused_by"
=>{"type"=>"illegal_argument_exception", "reason"=>"For input string: \"2020-06-
24T10:05:40Z\""}}}}}

I have many keys which contain timestamps. Is there any way I can parse all the keys that hold a timestamp in this format?

A field must have the same type on every document. There are two ways elasticsearch can determine the type of a field. Either there is an index template that tells elasticsearch the type, or elasticsearch will look at the value of the field and run a list of parsers against it. If it finds a parser that successfully parses it as a certain type then that is what it sets the field type to (I am simplifying reality here). Once the type is set, every subsequent document will be parsed using the parser for that type. In this case it has decided (or been told, via a template) that the field [repository][pushed_at] should be a long. However, in the document it is a string "2020-06-24T10:05:40Z", and the parser for long throws an exception when it sees that (and the document is not indexed).

Step one is to decide what type you want the field to be -- long? string? date?

If you have a template then update it and set that field type.

If not, that would suggest you previously indexed a document in which [repository][pushed_at] looked like a long (perhaps seconds since the epoch instead of a formatted date?). In that case you will need to start with a new index and make sure the first document has the type you want.

If you want the field to be a date then I would suggest using a date filter to parse it. Something like

date { match => [ "[repository][pushed_at]", "ISO8601" ] target => "[repository][pushed_at]" }
1 Like

I just added that to my Logstash filter section, but without any luck! I hope I don't have to make any modifications to the Ruby part?

Logstash File, PFB.

# Pipeline: HTTP input -> removeKeys.rb -> date parsing -> stdout + Elasticsearch.
input {
 http {
    host => "HOST_IP"
    port => "8080"
	# Sets the "type" field of each event to "json".
	type => "json"
  }
}

filter {
    # Strip every field whose name is listed in "keys"; the list is passed to
    # the script's register(params) as params['keys'].
    ruby {
            path => "C:\Users\Administrator\Desktop\TJ_ruby_Script\removeKeys.rb"
			script_params => { keys => [ "archive_url", "assignees_url", "avatar_url", "blobs_url", "branches_url", "clone_url", "collaborators_url" ] }
    }
	# Parse the ISO8601 string in [repository][pushed_at] and write the result
	# back to the same field.
	date { match => [ "[repository][pushed_at]", "ISO8601" ] target => "[repository][pushed_at]" }
}

output {
  # Print each event, including its metadata, for debugging.
  stdout { codec => rubydebug { metadata => true } }
  elasticsearch {
    hosts => "ELK_IP:9200"
    # NOTE: an existing index keeps the field types it already has, so a field
    # first indexed as long keeps rejecting date strings until a new index is used.
    index => "forgithub1"
    manage_template => false
  }
}

Did you start over with a new index?

1 Like

You asked the right question. Changed the index and it works perfect! :high_brightness: Many thanks to you! :champagne:

I am now saving the events from each repository in an index named after the repository, and I need help here because Logstash throws an error if the index name contains upper-case characters. For example, consider the scenario below:

[2020-07-04T14:22:57,720][ERROR][logstash.outputs.elasticsearch][main][431a96fad
f103bfe758d03c91265196795c6d6534f3eb8124d896efe4aa6155aassb995] Could not index event t
o Elasticsearch. {:status=>400, :action=>["index", {:_id=>nil, :_index=>"for-For
GitHub-push-2020.07.04", :routing=>nil, :_type=>"_doc"}, #<LogStash::Event:0x131
73e1>], :response=>{"index"=>{"_index"=>"for-ForGitHub-push-2020.07.04", "_type"
=>"_doc", "_id"=>nil, "status"=>400, "error"=>{"type"=>"invalid_index_name_excep
tion", "reason"=>"Invalid index name [for-ForGitHub-push-2020.07.04], must be lo
wercase", "index_uuid"=>"_na_", "index"=>"for-ForGitHub-push-2020.07.04"}}}}

In this case, ForGitHub is a repository name, and Logstash fails to push the event to an index of the same name because the name contains upper-case characters.

Is there any way I can convert the name which I get from the GitHub payload to lower case and then use it in the output section?

I am pasting my current logstash config for your reference.

 http {
    host => "Logstash_IP"
    port => "8080"
	type => "json"
  }
}

filter {
    # Strip every field whose name is listed in "keys"; the list is passed to
    # the script's register(params) as params['keys'].
    ruby {
            path => "C:\Users\Administrator\Desktop\My_Desktop_ruby_Script\removeKeys.rb"
			script_params => { keys => [ "archive_url", "assignees_url", "avatar_url", "blobs_url" ] }
    }
	# Parse the ISO8601 string in [repository][pushed_at] and write the result
	# back to the same field.
	date { match => [ "[repository][pushed_at]", "ISO8601" ] target => "[repository][pushed_at]" }
}

output {
  # Print each event, including its metadata, for debugging.
  stdout { codec => rubydebug { metadata => true } }
  elasticsearch {
    hosts => "ELK_IP:9200"
    # Index name is assembled per event from the repository name, the
    # x_github_event header and the event date.
    # NOTE: Elasticsearch rejects index names that contain upper-case
    # characters, so a repository name such as "ForGitHub" fails here unless
    # [repository][name] has been lower-cased earlier in the pipeline.
    index => "for-%{[repository][name]}-%{[headers][x_github_event]}-%{+YYYY.MM.dd}"
    manage_template => false
  }
}
mutate { lowercase => [ "[repository][name]" ] }
1 Like