Hi, I am new to Elasticsearch and I am stuck with an issue. I am developing an application with the Cascading API.
I am processing 10 million rows of data with 43 columns. My issue is this: when I dump the data to a sink tap (using the default Hfs sink tap) it takes 1-2 minutes to write everything, but when I use an EsTap instead of the Hfs tap the same dump takes about an hour. While configuring Elasticsearch we set up 3 nodes, with every node acting as both a master and a data node, set "bootstrap.memory_lock: true", and kept the rest of the settings at their defaults.
Do I need to change the configuration so that the process takes less time? Please help. Thanks in advance.
String inputPath = args[0] + File.separator + "10_million_rows.csv";

// Column schema shared by the source tap and both sink variants
Fields fields = new Fields(
        "empid", "gender", "title", "nameset", "surname", "city", "statefull", "zipcode",
        "header1", "header2", "header3", "header4", "header5", "header6", "header7", "header8",
        "header9", "header10", "header11", "header12", "header13", "header14", "header15", "header16",
        "header17", "header18", "header19", "header20", "header21", "header22", "header23", "header24",
        "header25", "header26", "header27", "header28", "header29", "header30", "header31", "header32",
        "expr1", "expr2", "expr3", "expr4" );

Tap inputTap = new Hfs( new TextDelimited( fields, true, "," ), inputPath );
Pipe pipe = new Pipe( "pipe" );

// HDFS sink: the whole dump finishes in 1-2 minutes
// Tap sinkTap = new Hfs( new TextDelimited( fields, true, "," ), "/hdfsdata/output" );

// Elasticsearch sink: the same dump takes about an hour
Tap sinkTap = new EsTap( "master-host", 9200, "index1/type1", fields );

FlowDef flowDef = FlowDef.flowDef()
        .addSource( pipe, inputTap )
        .addTailSink( pipe, sinkTap );

Properties properties = new Properties();
Flow flow = new Hadoop2MR1FlowConnector( properties ).connect( flowDef );
flow.complete();
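For what it's worth, I am not passing any connector settings through the Properties object at the moment. Below is a rough sketch of what I was considering trying, based on the es-hadoop configuration options; the node list and batch sizes are assumptions on my part, not something I have verified:

Properties properties = new Properties();
// es-hadoop reads these from the job configuration; the values below are guesses, not tested
properties.setProperty( "es.nodes", "master:9200,slave1:9200,slave2:9200" ); // assumption: all three nodes are reachable from the Hadoop tasks
properties.setProperty( "es.batch.size.entries", "5000" );   // docs per bulk request (connector default is 1000)
properties.setProperty( "es.batch.size.bytes", "5mb" );      // bytes per bulk request (connector default is 1mb)
properties.setProperty( "es.batch.write.refresh", "false" ); // don't call an index refresh after the bulk writes (default is true)

Flow flow = new Hadoop2MR1FlowConnector( properties ).connect( flowDef );
flow.complete();

Would increasing the bulk batch size like this actually make a difference, or is the bottleneck likely somewhere else?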
---------------------------------- Node1 -----------------------------------
cluster.name: electrik-io
node.name: master
node.master: true
node.data: true
path.data: "/secondary/elasticsearch/data"
path.logs: "/secondary/elasticsearch/logs"
bootstrap.memory_lock: true
bootstrap.system_call_filter: false
network.host: ["master", "localhost"]
http.port: 9200
transport.tcp.port: 9300
http.enabled: true
discovery.zen.ping.unicast.hosts: ["master", "slave1", "slave2"]
discovery.zen.minimum_master_nodes: 3
---------------------------------- Node2 -----------------------------------
cluster.name: electrik-io
node.name: slave1
node.master: true
node.data: true
path.data: "/secondary/elasticsearch/data"
path.logs: "/secondary/elasticsearch/logs"
bootstrap.memory_lock: true
bootstrap.system_call_filter: false
network.host: ["slave1", "localhost"]
http.port: 9200
transport.tcp.port: 9300
http.enabled: true
discovery.zen.ping.unicast.hosts: ["master", "slave1", "slave2"]
discovery.zen.minimum_master_nodes: 3
---------------------------------- Node3 -----------------------------------
cluster.name: electrik-io
node.name: slave2
node.master: true
node.data: true
path.data: "/secondary/elasticsearch/data"
path.logs: "/secondary/elasticsearch/logs"
bootstrap.memory_lock: true
bootstrap.system_call_filter: false
network.host: ["slave2", "localhost"]
http.port: 9200
transport.tcp.port: 9300
http.enabled: true
discovery.zen.ping.unicast.hosts: ["master", "slave1", "slave2"]
discovery.zen.minimum_master_nodes: 3
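One more thing I was unsure about: right now the job creates index1 automatically with default index settings. Would it help to pre-create the index with replicas and refresh disabled while the load runs? A rough sketch of what I mean, using plain java.net.HttpURLConnection against the master node (the settings values here are my own assumption, not something I have tested):

import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.StandardCharsets;

public class PrepareIndexForBulkLoad {
    public static void main( String[] args ) throws Exception {
        // Create index1 up front with bulk-load friendly settings:
        // no replicas and refresh disabled while the job runs (both can be restored afterwards).
        String settings = "{ \"settings\": { \"number_of_replicas\": 0, \"refresh_interval\": \"-1\" } }";

        HttpURLConnection conn = (HttpURLConnection) new URL( "http://master:9200/index1" ).openConnection();
        conn.setRequestMethod( "PUT" );
        conn.setDoOutput( true );
        conn.setRequestProperty( "Content-Type", "application/json" );
        try ( OutputStream out = conn.getOutputStream() ) {
            out.write( settings.getBytes( StandardCharsets.UTF_8 ) );
        }
        System.out.println( "create index1 -> HTTP " + conn.getResponseCode() );
    }
}

After the job finishes I would switch refresh_interval back to 1s and add the replicas again, if this is the right direction at all.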