Removing duplicates from an index with a Python script

I created a Python script to remove duplicates from my index, in order to compare:

from elasticsearch import Elasticsearch

# Settings
# Shared client used by search_datas() and delete_datas().
# NOTE(review): Elasticsearch listens on port 9200 by default, and the Logstash
# pipeline that feeds index "v000" writes to http://localhost:9200 -- confirm
# that port 9000 is really the intended endpoint.
es = Elasticsearch('http://localhost:9000/')

def search_datas():
    """Return the ``job.keyword`` values that appear in at least two documents.

    Runs a terms aggregation with ``min_doc_count: 2`` against index ``v000``
    and collects the key of every returned bucket.

    Returns:
        A list of duplicated values, or ``None`` when the response contains
        no duplicate bucket (``filter_path`` drops empty sections, so the
        ``aggregations`` key is simply absent in that case).
    """
    # Request body used to look for duplicates.
    search_query = {
        # Only the aggregation is read, so do not fetch any hit.
        "size": 0,
        "aggs": {
            "dup": {
                "terms": {
                    "size": 200000,
                    "field": "job.keyword",
                    "min_doc_count": 2,
                }
            }
        },
    }
    # Fetch the duplicated values from index 'v000'; filter_path keeps only
    # the bucket keys in the response.
    search_result = es.search(index="v000", body=search_query, doc_type='doc',
                              filter_path=['aggregations.dup.buckets.key'])

    # Walk the response defensively instead of swallowing every exception:
    # a missing level just means "no duplicates", while real errors
    # (connection problems, malformed responses) stay visible.
    buckets = search_result.get('aggregations', {}).get('dup', {}).get('buckets')
    if not buckets:
        return None
    return [bucket.get('key') for bucket in buckets]

def delete_datas(datas):
    """Delete the documents of index ``v000`` whose ``job.keyword`` is in *datas*.

    Args:
        datas: List of ``job.keyword`` values to delete, or ``None`` / an
            empty list when there is nothing to delete.

    NOTE(review): the terms query matches every document carrying one of the
    listed values, so all copies of a duplicated job are removed, not only
    the surplus ones -- confirm this is the intended behaviour.
    """
    # Nothing to delete: report it and skip the request entirely.
    if not datas:
        print('Pas de doublons')
        return

    # Request body used to delete the duplicated documents.
    delete_query = {
        "query": {
            "terms": {"job.keyword": datas}
        }
    }
    # Delete the matching documents from index 'v000'.
    es.delete_by_query(index="v000", body=delete_query, doc_type='doc')
# Script entry point: look up the duplicated job values, then delete the
# documents that carry them.
delete_datas(search_datas())

And my Logstash config is:

# Receive events through the beats input on port 5044.
input {
    beats {
    port => "5044"
    }
}                  

# Parse the CSV payload, run the external script, and flag failed jobs.
filter {
    # Split the comma-separated data into the named columns.
    csv {
        separator => ","
		columns => [ "chaine", "job", "statut", "serveur", "numero_passage", "application", "sous_application" ]
}

# NOTE(review): the ruby filter loads its script as Ruby code, but this path
# points to a Python (.py) file, so the pipeline fails to start with a Ruby
# SyntaxError.
ruby{
	path => "C:\Users\h83710\Desktop\elastic\logstash-7.5.2\deduplicate-elaticsearch.py"
	}

# When job MNNATY0P00, MNNATY0P01 or MNNATY0P02 has status "EN-ERREUR",
# add the field statut_globale with value "0".
if [statut] == "EN-ERREUR" and [job] =~ /^MNNATY0P(00|01|02)$/ {
  mutate {
    add_field  => { "statut_globale" => "0" }
  }
}

}
	
# Print every event to stdout with the rubydebug codec and index it into
# "v000" on the Elasticsearch node at localhost:9200.
output {
  stdout {codec => rubydebug}
  elasticsearch { 
  hosts => "http://localhost:9200" 
  index => "v000"
  }
}

I get this error:
Failed to execute action {:action=>LogStash::PipelineAction::Create/pipeline_id:main, :exception=>"Java::JavaLang::IllegalStateException", :message=>"Unable to configure plugins: (SyntaxError) C:\\Users\\h83710\\Desktop\\elastic\\logstash-7.5.2\\deduplicate-elaticsearch.py:6: syntax error, unexpected ':'\ndef search_datas():\n ^", :backtrace=>["org.logstash.config.ir.CompiledPipeline.<init>(CompiledPipeline.java:119)", "org.logstash.execution.JavaBasePipelineExt.initialize(JavaBasePipelineExt.java:60)", "org.logstash.execution.JavaBasePipelineExt$INVOKER$i$1$0$initialize.call(JavaBasePipelineExt$INVOKER$i$1$0$initialize.gen)", "org.jruby.internal.runtime.methods.JavaMethod$JavaMethodN.call(JavaMethod.java:837)", "org.jruby.ir.runtime.IRRuntimeHelpers.instanceSuper(IRRuntimeHelpers.java:1156)", "org.jruby.ir.runtime.IRRuntimeHelpers.instanceSuperSplatArgs(IRRuntimeHelpers.java:1143)", "org.jruby.ir.targets.InstanceSuperInvokeSite.invoke(InstanceSuperInvokeSite.java:39)", "C_3a_.Users.h83710.Desktop.elastic.logstash_minus_7_dot_5_dot_2.logstash_minus_core.lib.logstash.java_pipeline.RUBY$method$initialize$0(C:/Users/h83710/Desktop/elastic/logstash-7.5.2/logstash-core/lib/logstash/java_pipeline.rb:27)", "org.jruby.internal.runtime.methods.CompiledIRMethod.call(CompiledIRMethod.java:91)", "org.jruby.internal.runtime.methods.MixedModeIRMethod.call(MixedModeIRMethod.java:90)", "org.jruby.runtime.callsite.CachingCallSite.cacheAndCall(CachingCallSite.java:332)", "org.jruby.runtime.callsite.CachingCallSite.call(CachingCallSite.java:86)", "org.jruby.RubyClass.newInstance(RubyClass.java:915)", "o

The ruby filter expects ruby, not python - https://www.elastic.co/guide/en/logstash/current/plugins-filters-ruby.html

It's not clear why you are running this in Logstash though?

If you want to avoid duplicates, why not prevent them when indexing new data? You could use the id you are deduplicating on as a document id and not have to do this at all.

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.