ES-Hadoop 5.1.1 performance


(Tommaso Anzidei) #1

Hello,
we upgraded Elasticsearch to 5.1.1 from 2.1. After that upgrade we noticed a loss of performance in running hive queries against an external table linked to an Elasticsearch index. The same query used to take 30 minutes before the upgrade and now runs in more than 2 hours. Nothing was changed apart of upgrading. Is there any suggestion?

Thanks a lot

Tommaso


(James Baiera) #2

Could you please include information on the query that you are executing, as well as the mappings of the data in elasticsearch, and any other profiling information that you have collected?


(Tommaso Anzidei) #3

Hi James,
thanks a lot for your suggestions. Our mappings are:

{
"syslogfe" : {
"mappings" : {
"syslog" : {
"properties" : {
"@timestamp" : {
"type" : "date",
"format" : "strict_date_optional_time||epoch_millis"
},
"@version" : {
"type" : "string"
},
"build" : {
"type" : "string"
},
"connection-status" : {
"type" : "string"
},
"device" : {
"type" : "string"
},
"geoip" : {
"properties" : {
"area_code" : {
"type" : "long"
},
"city_name" : {
"type" : "string"
},
"continent_code" : {
"type" : "string"
},
"coordinates" : {
"type" : "geo_point",
"geohash" : true,
"geohash_prefix" : true
},
"country_code2" : {
"type" : "string"
},
"country_code3" : {
"type" : "string"
},
"country_name" : {
"type" : "string"
},
"dma_code" : {
"type" : "long"
},
"ip" : {
"type" : "string"
},
"latitude" : {
"type" : "double"
},
"location" : {
"type" : "double"
},
"longitude" : {
"type" : "double"
},
"postal_code" : {
"type" : "string"
},
"real_region_name" : {
"type" : "string"
},
"region_code" : {
"type" : "string",
"fields" : {
"keyword" : {
"type" : "string",
"index" : "not_analyzed",
"ignore_above" : 256,
"fielddata" : false
}
},
"fielddata" : false
},
"region_name" : {
"type" : "string"
},
"timezone" : {
"type" : "string"
}
}
},
"host" : {
"type" : "string"
},
"http-action" : {
"type" : "string"
},
"http-status" : {
"type" : "string"
},
"language" : {
"type" : "string"
},
"local-ip" : {
"type" : "string"
},
"major" : {
"type" : "string"
},
"message" : {
"type" : "string"
},
"minor" : {
"type" : "string"
},
"name" : {
"type" : "string"
},
"os" : {
"type" : "string"
},
"os_major" : {
"type" : "string"
},
"os_minor" : {
"type" : "string"
},
"os_name" : {
"type" : "string"
},
"patch" : {
"type" : "string"
},
"port" : {
"type" : "long"
},
"query-string" : {
"type" : "string"
},
"referer" : {
"type" : "string"
},
"remote-ip" : {
"type" : "string"
},
"request-bytes" : {
"type" : "string"
},
"response-bytes" : {
"type" : "string"
},
"secure-cipher" : {
"type" : "string"
},
"secure-protocol" : {
"type" : "string"
},
"secure-session" : {
"type" : "string"
},
"secure-user" : {
"type" : "string"
},
"tags" : {
"type" : "string"
},
"time" : {
"type" : "date",
"format" : "yyyy-MM-dd HH:mm:ss"
},
"time-to-serve-us" : {
"type" : "string"
},
"timestamp" : {
"type" : "string"
},
"type" : {
"type" : "string"
},
"url" : {
"type" : "string",
"index" : "not_analyzed",
"fielddata" : false
},
"user-agent" : {
"type" : "string"
}
}
}
}
}
}

which we map to an external hive table with:

CREATE EXTERNAL TABLE EXT_LOGSIS_PRD
(
event_time string,
secure-user string,
url string,
query-string string,
http-status string,
city_name string,
postal_code string,
real_region_name string
)
STORED BY 'org.elasticsearch.hadoop.hive.EsStorageHandler'
TBLPROPERTIES
(
'es.resource' = 'syslogfe/syslog',
'es.mapping.date.rich' = 'false',
'es.nodes' = 'http://:9200',
'es.nodes.wan.only' = 'true',
'es.mapping.names' ='
event_time:time,
city_name:geoip.city_name,
postal_code:geoip.postal_code,
real_region_name:geoip.real_region_name
'
);

and then we query that external table with:

CREATE TABLE LST_LOGSIS_PRD AS
SELECT
*
FROM EXT_LOGSIS_PRD
WHERE
YEAR(TO_DATE(event_time))= 2016
AND MONTH(TO_DATE(event_time))= 12
AND http-status IN ('200','304')
This last query used to run in around 30 minutes and now takes 4 hours to complete.
Anyway we now suppose that the update to ELK5 changed something in the config files of Elasticsearch (namely the use of JVM resources) and we are investigating it. So we are confident that the new ES-Hadoop jar is not responsible for that problem. Again thanks, the Elasticsearch community is really a supportive one!

Tommaso


(system) #4

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.