So I have this XML with 30mb and I'm trying to put into Elasticsearch with logstash, each XML can have a lot of pages and each page need to be save in Elasticsearch what I have until now is:
<?xml version="1.0" ?>
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<title></title>
<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
<meta name="ocr-system" content="tesseract v5.0.0-alpha.20200223" />
<meta name="ocr-capabilities" content="ocr_page ocr_carea ocr_par ocr_line ocrx_word ocrp_wconf" />
</head>
<body>
<div class="ocr_page" id="page_1" title="image bbox 0 0 2480 3505; ppageno 0">
<div class="ocr_carea" id="block_1_37" title="bbox 187 261 2291 2001">
<p class="ocr_par" id="par_1_37" lang="por" title="bbox 187 261 2291 2001" xml:lang="por">
<span class="ocr_line" id="line_1_37" title="bbox 2016 261 2263 279; baseline -0.004 0; x_size 19.949947; x_descenders 3.9499469; x_ascenders 4.0098972">
<span class="ocrx_word" id="word_1_37" title="bbox 2016 263 2123 279; x_wconf 96">a</span>
<span class="ocrx_word" id="word_1_38" title="bbox 2133 261 2154 278; x_wconf 93">b</span>
<span class="ocrx_word" id="word_1_39" title="bbox 2163 257 2176 287; x_wconf 93">c</span>
<span class="ocrx_word" id="word_1_40" title="bbox 2178 262 2263 278; x_wconf 96">d</span>
</span>
<span class="ocr_line" id="line_1_38" title="bbox 838 293 2251 328; baseline -0.013 -0.153; x_size 19.949947; x_descenders 3.9499469; x_ascenders 4.0098972">
<span class="ocrx_word" id="word_1_41" title="bbox 838 322 847 328; x_wconf 85">f</span>
<span class="ocrx_word" id="word_1_42" title="bbox 2030 293 2251 314; x_wconf 96">g</span>
<span class="ocrx_word" id="word_1_43" title="bbox 2112 289 2135 319; x_wconf 90">h</span>
<span class="ocrx_word" id="word_1_44" title="bbox 2144 289 2156 319; x_wconf 92">i</span>
<span class="ocrx_word" id="word_1_45" title="bbox 2164 293 2251 314; x_wconf 92">j</span>
</span>
</p>
</div>
</div>
<div class="ocr_page" id="page_2" title="image bbox 0 0 2480 3505; ppageno 0">
<div class="ocr_carea" id="block_2_37" title="bbox 187 261 2291 2001">
<p class="ocr_par" id="par_2_37" lang="por" title="bbox 187 261 2291 2001" xml:lang="por">
<span class="ocr_line" id="line_2_37" title="bbox 2016 261 2263 279; baseline -0.004 0; x_size 19.949947; x_descenders 3.9499469; x_ascenders 4.0098972">
<span class="ocrx_word" id="word_2_37" title="bbox 2016 263 2123 279; x_wconf 96">k</span>
<span class="ocrx_word" id="word_2_38" title="bbox 2133 261 2154 278; x_wconf 93">l</span>
<span class="ocrx_word" id="word_2_39" title="bbox 2163 257 2176 287; x_wconf 93">m</span>
<span class="ocrx_word" id="word_2_40" title="bbox 2178 262 2263 278; x_wconf 96">n</span>
</span>
<span class="ocr_line" id="line_2_38" title="bbox 838 293 2251 328; baseline -0.013 -0.153; x_size 19.949947; x_descenders 3.9499469; x_ascenders 4.0098972">
<span class="ocrx_word" id="word_2_41" title="bbox 838 322 847 328; x_wconf 85">o</span>
<span class="ocrx_word" id="word_2_42" title="bbox 2030 293 2251 314; x_wconf 96">p</span>
<span class="ocrx_word" id="word_2_43" title="bbox 2112 289 2135 319; x_wconf 90">q</span>
<span class="ocrx_word" id="word_2_44" title="bbox 2144 289 2156 319; x_wconf 92">r</span>
<span class="ocrx_word" id="word_2_45" title="bbox 2164 293 2251 314; x_wconf 92">s</span>
</span>
</p>
</div>
</div>
</body>
</html>
my mapping is this:
{
"settings": {
"analysis": {
"analyzer": {
"ngram_analyzer": {
"tokenizer": "ngram_tokenizer",
"filter": [
"lowercase"
]
}
},
"tokenizer": {
"ngram_tokenizer": {
"type": "ngram",
"min_gram": 3,
"max_gram": 3,
"token_chars": [
"letter",
"digit"
]
}
}
}
},
"mappings": {
"properties": {
"filename": {
"type": "keyword"
},
"ocr_page": {
"type": "keyword"
},
"page_words": {
"type": "text",
"analyzer": "ngram_analyzer"
},
"lines": {
"type": "nested",
"properties": {
"ocr_line": {
"type": "keyword"
},
"line_words": {
"type": "text",
"analyzer": "ngram_analyzer"
},
"words": {
"type": "nested",
"properties": {
"ocrx_word": {
"type": "keyword"
},
"title": {
"type": "keyword"
},
"word": {
"type": "text",
"analyzer": "ngram_analyzer"
}
}
}
}
}
}
}
}
and my pipeline locks like this:
input {
http {
port => 8088
}
}
filter {
xml {
source => "message"
target => "xhtml"
}
ruby {
code => '
doc = []
filename = event.get("[headers][filename]")
b = event.get("[xhtml][body]")
b.each_index { |xb|
#page
d1 = b[xb]["div"]
d1.each_index { |xd1|
ocr_page = d1[xd1]["id"]
pwords = ""
lines = []
#block
d2 = d1[xd1]["div"]
d2.each_index { |xd2|
#paragraph
p = d2[xd2]["p"]
p.each_index { |xp|
#line
s1 = p[xp]["span"]
s1.each_index { |xs1|
lwords = ""
words = []
#word
s2 = s1[xs1]["span"]
s2.each_index { |xs2|
if !s2[xs2]["content"].nil?
pwords << s2[xs2]["content"]
pwords << " "
lwords << s2[xs2]["content"]
lwords << " "
word = s2[xs2]["content"].chomp(" ")
title = s2[xs2]["title"].chomp(" ")
ocrx_word = s2[xs2]["id"].chomp(" ")
words << {"word" => word, "title" => title, "ocrx_word" => ocrx_word}
end
}
if !lwords.nil? && lwords != " "
lines << {"ocr_line" => s1[xs1] ["id"], "line_words" => lwords.chomp(" "), "words" => words}
end
}
}
}
if !pwords.nil? && pwords != " "
doc << {"page_words" => pwords.chomp(" "), "filename" => filename, "ocr_page" => ocr_page, "lines" => lines}
end
}
event.set("[_doc]", doc)
}
'
}
mutate{
remove_field => ["xhtml","headers","@timestamp","host","@version","message"]
}
}
output {
elasticsearch {
index => "nome_index"
hosts => "localhost:9200"
}
stdout {
codec => rubydebug
}
}
I'm getting out of memory, if the file is bigger. Is there a better way of doing the same thing. Without increasing JVM memory. Any solution is helpful.
I already put this:
`
LS_JAVA_OPTS=" -Xmx4g -Xms4g"
`