I created a Python script to remove duplicates from my index, in order to compare results:
from elasticsearch import Elasticsearch

# Settings
es = Elasticsearch('http://localhost:9200/')

def search_datas():
    # Body of the aggregation that searches for duplicate values of job.keyword
    search_query = {
        "aggs": {
            "dup": {
                "terms": {
                    "size": 200000,
                    "field": "job.keyword",
                    "min_doc_count": 2
                }
            }
        }
    }
    # The query that fetches all duplicated keys from the index
    search_result = es.search(index="v000", body=search_query,
                              filter_path=['aggregations.dup.buckets.key'])
    new_datas = []
    try:
        datas = search_result['aggregations']['dup']['buckets']
        for bucket in datas:
            new_datas.append(bucket['key'])
        return new_datas
    except KeyError:
        # No aggregation buckets means no duplicates were found
        return None

def delete_datas(datas):
    # Body of the query that deletes every document matching a duplicated key
    delete_query = {
        "query": {
            "terms": {"job.keyword": datas}
        }
    }
    if datas:
        # The query that deletes all the duplicates from the index
        es.delete_by_query(index="v000", body=delete_query)
    else:
        print('No duplicates')

delete_datas(search_datas())
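One caveat worth noting: delete_by_query with a terms query removes every copy of a duplicated key, including the one copy you presumably want to keep. Below is a minimal sketch of a keep-one variant, assuming the same index v000 and field job.keyword as above, and that each key has at most 100 copies (the default top_hits limit); it collects the duplicate document _ids with a top_hits sub-aggregation and bulk-deletes all but the first:

# A minimal sketch: keep one document per duplicated key, delete the rest.
from elasticsearch import Elasticsearch, helpers

es = Elasticsearch('http://localhost:9200/')

def delete_duplicates_keep_one(index="v000", field="job.keyword"):
    query = {
        "size": 0,
        "aggs": {
            "dup": {
                "terms": {"field": field, "size": 200000, "min_doc_count": 2},
                "aggs": {
                    # collect the _ids of each duplicate group (no _source needed)
                    "ids": {"top_hits": {"size": 100, "_source": False}}
                }
            }
        }
    }
    result = es.search(index=index, body=query)
    actions = []
    for bucket in result['aggregations']['dup']['buckets']:
        hits = bucket['ids']['hits']['hits']
        # keep the first hit, queue the rest for deletion
        for hit in hits[1:]:
            actions.append({"_op_type": "delete", "_index": index, "_id": hit['_id']})
    if actions:
        helpers.bulk(es, actions)
    else:
        print('No duplicates')

Sorting the top_hits on a timestamp field would let you choose which copy survives.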
and my Logstash config is:
input {
beats {
port => "5044"
}
}
filter {
csv {
separator => ","
columns => [ "chaine", "job", "statut", "serveur", "numero_passage", "application", "sous_application" ]
}
ruby{
path => "C:\Users\h83710\Desktop\elastic\logstash-7.5.2\deduplicate-elaticsearch.py"
}
if [statut] == "EN-ERREUR" and [job] =~ /^MNNATY0P(00|01|02)$/ {
mutate {
add_field => { "statut_globale" => "0" }
}
}
}
output {
stdout { codec => rubydebug }
elasticsearch {
hosts => "http://localhost:9200"
index => "v000"
}
}
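Note that the ruby filter's path option expects a Ruby script that defines a filter(event) method; Logstash will not execute a Python file there. One workaround, sketched below with an assumed 60-second interval and the script path taken from the config above, is to run the deduplication periodically outside Logstash:

# Hypothetical runner: invokes the deduplication script on a timer,
# outside Logstash, since the ruby filter can only run Ruby code.
import subprocess
import time

SCRIPT = r"C:\Users\h83710\Desktop\elastic\logstash-7.5.2\deduplicate-elaticsearch.py"

while True:
    subprocess.run(["python", SCRIPT], check=True)
    time.sleep(60)  # assumed interval; tune to your indexing rate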