I am getting this error when trying to index using Logstash:
warning: thread "[main]>worker1" terminated with exception (report_on_exception is true):
java.lang.OutOfMemoryError: UTF16 String size is 1371255266, should be less than 1073741823
also, the customer_id is unique in the customer table, however the logstash is repeating the records more than 500 times per same customer_id
========
Heap size is 32 GB, server has 64 GB memory
We are attempting to index 1 terabyte of data, which is about 500,000 of our records.
Our Logstash script is below. Any help with the java.lang.OutOfMemoryError and the repeating rows would be greatly appreciated.
input {
  jdbc {
    jdbc_connection_string => "jdbc:sqlserver://SERVERA-DB:1433;databaseName=db_1;integratedsecurity=true;"
    jdbc_driver_class => "com.microsoft.sqlserver.jdbc.SQLServerDriver"
    jdbc_user => "bob_hanson"
    jdbc_paging_enabled => "true"
    jdbc_page_size => "5000"
    # FIX 1: the statement string was unterminated (the closing quote was missing
    # after "from [customers]"), which swallowed the tags => line into the SQL.
    # FIX 2: with jdbc_paging_enabled, the plugin wraps this query in SQL Server
    # OFFSET/FETCH pages. Without a deterministic ORDER BY, rows can appear in a
    # different order on every page fetch, so the same customer_id shows up on
    # many pages — this is the cause of the duplicated records. Ordering by the
    # unique customer_id makes paging stable.
    statement => "SELECT customer_id, first_name, middle_initial, last_name, address1, address2, city, zip_code
                  FROM [customers]
                  ORDER BY customer_id"
    tags => ["customer_search_tags"]
  }
}
filter {
  jdbc_streaming {
    jdbc_connection_string => "jdbc:sqlserver://SERVERA-DB:1433;databaseName=db_1;integratedsecurity=true;"
    jdbc_driver_class => "com.microsoft.sqlserver.jdbc.SQLServerDriver"
    jdbc_user => "bob_hanson"
    # FIX for java.lang.OutOfMemoryError "UTF16 String size ... should be less
    # than 1073741823": a Java String holds at most 2^30-1 chars, and at least
    # one row's document column exceeds that (the error reports 1,371,255,266).
    # The DATALENGTH guard skips documents whose stored size (in bytes) would
    # blow that limit instead of killing the pipeline worker.
    # NOTE(review): DATALENGTH is bytes, not characters — for nvarchar/UTF-16
    # columns 1 char = 2 bytes, so this bound is conservative; tune it after
    # confirming the column type. Oversized documents are silently excluded;
    # log/report them separately if they must be indexed (e.g. in chunks).
    statement => "SELECT document_id, document, document_content_type, document_name FROM Customer_Documents cd
                  WHERE document is not null
                  AND DATALENGTH(cd.document) < 1073741823
                  AND document_content_type in ('application/pdf', 'application/msword', 'text/plain','application/atom+xml','application/msaccess','application/msexcel','application/vnd.ms-excel','application/vnd.ms-officetheme','application/vnd.ms-outlook',
                  'application/vnd.ms-powerpoint','application/vnd.ms-word.document.macroEnabled.12','application/vnd.openxmlformats-officedocument.pres','application/vnd.openxmlformats-officedocument.spre','application/vnd.openxmlformats-officedocument.word','application/vnd.visio','application/x-cpy',
                  'message/rfc822','text/css','text/html', 'text/xml') AND cd.customer_id = :cc_customer_id"
    parameters => { "cc_customer_id" => "customer_id"}
    target => "attachments"
  }
}
output {
  elasticsearch {
    # FIX: the URL must be a quoted string — a bare https://... inside the
    # array is a Logstash config syntax error.
    hosts => ["https://XXXXXXXXXXXX:9200"]
    cacert => "E:\logstash\config\blog_cert.pem"
    ssl => true
    # NOTE(review): disabling certificate verification defeats the point of
    # providing cacert above; re-enable once the CA certificate is trusted.
    ssl_certificate_verification => false
    pipeline => "attachments"
    index => "customer_search_entries"
    user => "elastic"
    password => "XXXXXXXX=="
  }
}