Here is my current config, using the multiline codec:
input {
  file {
    path => [
      "/opt/uspto/*.xml"
    ]
    start_position => "beginning"
    # use for testing
    sincedb_path => "/dev/null"
    # set this sincedb path when not testing
    #sincedb_path => "/opt/logstash/tmp/sincedb"
    exclude => "*.gz"
    type => "xml"
    codec => multiline {
      #pattern => "<wo-ocr-published-application"
      pattern => "<?xml version=\"1.0\" encoding=\"UTF-8\"\?>"
      negate => "true"
      what => "previous"
      max_lines => 3000
    }
  }
}
filter {
  if "multiline" in [tags] {
    mutate {
      gsub => [
        # replace <p> with a blank
        "message", "<p\s+id=\"\S+\"\s+num=\"\S+\">", "",
        # replace </p> with a new line
        "message", "</p>", "\n",
        # replace <claim-text> with a blank
        "message", "<claim-text>", "",
        # replace </claim-text> with a new line
        "message", "</claim-text>", "\n"
      ]
    }
    mutate {
      # add some new fields which we will populate with parsed data in the replace section
      add_field => {
        "country" => ""
        "docnumber" => ""
        "kind" => ""
        "date" => ""
        "title" => ""
        "abstract" => ""
        "applicants" => ""
      }
      # I believe this would just create a field like 'claims.content' => "claims.content".
      # I need to pull the data out of one field and create a new field with the actual content.
      #replace => [ "[xmldata][claims][0][claim][0][content]", "%{[xmldata][claims][0][claim][0][content]}" ]
      #replace => [ "[xmldata][country]", "%{[xmldata][country]}" ]
    }
    grok {
      patterns_dir => ["/etc/logstash/patterns"]
      # capture the content between <claims lang="..."> and </claims>
      match => [ "message", "%{WIPOCLAIMS:claims_data}" ]
      # capture the two-letter language code from the <claims lang="..."> attribute
      match => [ "message", "%{WIPOCLAIMSLANG:claims_language}" ]
    }
    xml {
      source => "message"
      #store_xml => false
      target => "xmldata"
      xpath => [
        "/us-patent-application/us-bibliographic-data-application/publication-reference/document-id/country/text()", "country",
        "/us-patent-application/us-bibliographic-data-application/publication-reference/document-id/doc-number/text()", "docnumber",
        "/us-patent-application/us-bibliographic-data-application/publication-reference/document-id/kind/text()", "kind",
        "/us-patent-application/us-bibliographic-data-application/publication-reference/document-id/date/text()", "date",
        "/us-patent-application/us-bibliographic-data-application/invention-title/text()", "title",
        "/us-patent-application/abstract/text()", "abstract"
        #"concat(//applicant/addressbook/last-name/text(),',',//applicant/addressbook/first-name/text())", "applicants" # this works, but creates multiple records
        #"string-join(//applicant/addressbook/(concat(last-name/text(), ',', first-name/text())), ' ')", "applicants" # only available in XPath 2.0
        #"/us-patent-application//us-bibliographic-data-application/applicant/addressbook/last-name/text()", "applicants" # only gets one value; I need all of them
      ]
    }
  }
}
output {
  elasticsearch {
    codec => json
    hosts => "removed:443"
    index => "uspto"
  }
  stdout {
    codec => rubydebug
  }
}
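
For context, the documents have roughly this shape (heavily trimmed, intermediate wrapper elements omitted and all values made up), which is what the xpath expressions above are targeting; it is the repeated applicant/addressbook blocks that I cannot flatten into a single field:

<us-patent-application>
  <us-bibliographic-data-application>
    <publication-reference>
      <document-id>
        <country>US</country>
        <doc-number>99999999</doc-number>
        <kind>A1</kind>
        <date>20160101</date>
      </document-id>
    </publication-reference>
    <invention-title>Example title</invention-title>
    <applicant>
      <addressbook><last-name>Doe</last-name><first-name>Jane</first-name></addressbook>
    </applicant>
    <applicant>
      <addressbook><last-name>Smith</last-name><first-name>John</first-name></addressbook>
    </applicant>
  </us-bibliographic-data-application>
  <abstract><p id="p-0001" num="0000">Example abstract text.</p></abstract>
  <claims lang="EN">
    <claim><claim-text>Example claim text.</claim-text></claim>
  </claims>
</us-patent-application>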
Is it possible for me to iterate over the XML nodes in Ruby, possibly building an array and then placing that into a field?
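Something along these lines is what I have in mind: an untested sketch of a ruby filter that walks the applicant nodes with Nokogiri and writes the joined names into one array field. I'm assuming Nokogiri can be required from the ruby filter (the xml filter already depends on it) and that the newer event.get/event.set API is available; the field and element names are taken from my config above.

ruby {
  code => '
    require "nokogiri"

    # parse the reassembled multiline event and drop namespaces so the
    # plain element names below match
    doc = Nokogiri::XML(event.get("message"))
    doc.remove_namespaces!

    # build one "last,first" string per applicant
    names = doc.xpath("//applicant/addressbook").map do |ab|
      last  = ab.at_xpath("last-name")
      first = ab.at_xpath("first-name")
      [last, first].compact.map(&:text).join(",")
    end

    # store all applicants as an array in a single field
    event.set("applicants", names) unless names.empty?
  '
}

If that is a sensible direction, I would drop the commented-out applicant xpath attempts above and let the ruby filter own that field.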
I'm open to any ideas that might get the job done reliably.
Thanks,