Hi James,
thanks a lot for your suggestions. Our mappings are:
{
"syslogfe" : {
"mappings" : {
"syslog" : {
"properties" : {
"@timestamp" : {
"type" : "date",
"format" : "strict_date_optional_time||epoch_millis"
},
"@version" : {
"type" : "string"
},
"build" : {
"type" : "string"
},
"connection-status" : {
"type" : "string"
},
"device" : {
"type" : "string"
},
"geoip" : {
"properties" : {
"area_code" : {
"type" : "long"
},
"city_name" : {
"type" : "string"
},
"continent_code" : {
"type" : "string"
},
"coordinates" : {
"type" : "geo_point",
"geohash" : true,
"geohash_prefix" : true
},
"country_code2" : {
"type" : "string"
},
"country_code3" : {
"type" : "string"
},
"country_name" : {
"type" : "string"
},
"dma_code" : {
"type" : "long"
},
"ip" : {
"type" : "string"
},
"latitude" : {
"type" : "double"
},
"location" : {
"type" : "double"
},
"longitude" : {
"type" : "double"
},
"postal_code" : {
"type" : "string"
},
"real_region_name" : {
"type" : "string"
},
"region_code" : {
"type" : "string",
"fields" : {
"keyword" : {
"type" : "string",
"index" : "not_analyzed",
"ignore_above" : 256,
"fielddata" : false
}
},
"fielddata" : false
},
"region_name" : {
"type" : "string"
},
"timezone" : {
"type" : "string"
}
}
},
"host" : {
"type" : "string"
},
"http-action" : {
"type" : "string"
},
"http-status" : {
"type" : "string"
},
"language" : {
"type" : "string"
},
"local-ip" : {
"type" : "string"
},
"major" : {
"type" : "string"
},
"message" : {
"type" : "string"
},
"minor" : {
"type" : "string"
},
"name" : {
"type" : "string"
},
"os" : {
"type" : "string"
},
"os_major" : {
"type" : "string"
},
"os_minor" : {
"type" : "string"
},
"os_name" : {
"type" : "string"
},
"patch" : {
"type" : "string"
},
"port" : {
"type" : "long"
},
"query-string" : {
"type" : "string"
},
"referer" : {
"type" : "string"
},
"remote-ip" : {
"type" : "string"
},
"request-bytes" : {
"type" : "string"
},
"response-bytes" : {
"type" : "string"
},
"secure-cipher" : {
"type" : "string"
},
"secure-protocol" : {
"type" : "string"
},
"secure-session" : {
"type" : "string"
},
"secure-user" : {
"type" : "string"
},
"tags" : {
"type" : "string"
},
"time" : {
"type" : "date",
"format" : "yyyy-MM-dd HH:mm:ss"
},
"time-to-serve-us" : {
"type" : "string"
},
"timestamp" : {
"type" : "string"
},
"type" : {
"type" : "string"
},
"url" : {
"type" : "string",
"index" : "not_analyzed",
"fielddata" : false
},
"user-agent" : {
"type" : "string"
}
}
}
}
}
}
which we map to an external hive table with:
CREATE EXTERNAL TABLE EXT_LOGSIS_PRD
(
event_time
string,
secure-user
string,
url
string,
query-string
string,
http-status
string,
city_name
string,
postal_code
string,
real_region_name
string
)
STORED BY 'org.elasticsearch.hadoop.hive.EsStorageHandler'
TBLPROPERTIES
(
'es.resource' = 'syslogfe/syslog',
'es.mapping.date.rich' = 'false',
'es.nodes' = 'http://:9200',
'es.nodes.wan.only' = 'true',
'es.mapping.names' ='
event_time:time,
city_name:geoip.city_name,
postal_code:geoip.postal_code,
real_region_name:geoip.real_region_name
'
);
and then we query that external table with:
CREATE TABLE LST_LOGSIS_PRD AS
SELECT
*
FROM EXT_LOGSIS_PRD
WHERE
YEAR(TO_DATE(event_time
))= 2016
AND MONTH(TO_DATE(event_time
))= 12
AND http-status
IN ('200','304')
This last query used to run in around 30 minutes and now takes 4 hours to complete.
Anyway we now suppose that the update to ELK5 changed something in the config files of Elasticsearch (namely the use of JVM resources) and we are investigating it. So we are confident that the new ES-Hadoop jar is not responsible for that problem. Again thanks, the Elasticsearch community is really a supportive one!
Tommaso