I want to search for similar sentences using dense vectors computed by BERT.
I am following this great paper (available in French or English).
I indexed sentences, each with a dense_vector computed by BERT.
I ran this query successfully in Kibana, and it gives wonderful results (I shortened the dense_vector values for this post):
GET webmonitor_phrases/_search
{
"size" : 5,
"query": {
"script_score": {
"query" : {
"match_all" : {}
},
"script": {
"source": "cosineSimilarity(params.queryVector, 'Phrase_vector') + 1.0",
"params": {
"queryVector": [-0.01927643,-0.018217519,-0.021058332,0.027151054,0.05784438,-0.008460273,-0.027456086,0.042760707,-0.08016594,-0.017899536,-0.041639842,0.02960743,-0.01679815,0.1015308,0.05703567,0.0069785477,0.009718613,0.008082357,-0.012589352,0.12446663,0.018861143,0.053296693,-0.03215604,0.010972261,-0.10968127,0.012995018,0.07786082,-0.017690469,0.03248199,-0.04271781,-0.06806753,-0.07297233,0.0035993687,0.011882267,0.016708337,-0.019617425,-0.047242224,0.014557032,0.04139613,-0.008320617,-0.04017011,-0.03703198,0.022198584,-0.021771815,0.02410845,0.016920311,-0.02317638,0.004767835,0.023395542,0.007503315,0.025918411,-0.016145818,-0.06478562,-0.08040128,0.002344442,-0.008921131,0.047266897,0.047718693,-0.0011429694,0.05584679,-0.024764488,0.0192903,0.024397915,-0.045773678,-0.024413446,0.021329394,-0.0469761,0.063250914,-0.046951707,0.0404466,-0.065411255,-0.038457282,0.029206796,-0.01588464,0.04883571,0.026628332,0.05048967,-0.012432704,-0.024493853,0.003305153,0.0039466517,-0.04878099,-0.033499695,0.040233597,0.01040456,0.0076247444,-0.052615903,0.036758807,-0.0438332,-0.015946992,-0.08943201,0.014444613,-0.0019423354,-0.020825846,-0.022945268,0.0867511,-0.09420672,0.011214681,-0.06201148,0.10487903,-0.015105568,-0.00086394855,0.04614532,0.05401257]
}
}
}
}
}
I am facing an issue when trying to do exactly the same thing using the Java REST client.
Following this, I wrote the snippet below:
// construction de la request
request = new SearchTemplateRequest();
request.setRequest(searchRequest);
request.setScriptType(ScriptType.INLINE);
// " \"source\": \"cosineSimilarity([{{value_vector}}], doc['{{interro_vector}}']) + 1.0\" " +
// " \"source\": \"cosineSimilarity([" + dense_vector + "], doc['" + enrg.getInterrogation() + "']) + 1.0\" " +
script =
"{" +
" \"script_score\": { " +
" \"query\": { " +
" \"query_string\" : { " +
" \"query\" : \"" + query + "\", " +
" \"default_field\" : \"tout\" " +
" } " +
" }, " +
" \"script\": { " +
" \"source\": \"cosineSimilarity([" + dense_vector + "], '" + enrg.getInterrogation() + "' ) + 1.0\" " +
" } " +
" } " +
"}";
request.setScript(script);
Map<String, Object> scriptParams = new HashMap<>();
//scriptParams.put("interro_vector", enrg.getInterrogation());
//scriptParams.put("value_vector", dense_vector );
request.setScriptParams(scriptParams);
I get this in the logs (I shortened the dense_vector values for this post):
|2020/06/26 13:17:37 075-qeswebmonitor-demoqwam-TRACE-QESHost_elastic62-buildRequestForDenseVector()|: Fin:
{
"script_score": {
"query": {
"query_string" : {
"query" : "*:*",
"default_field" : "tout"
}
},
"script": {
"source":
"cosineSimilarity([-0.01927643,-0.018217519,-0.021058332,0.027151054,0.05784438,-0.008460273,-0.027456086,0.042760707,-0.08016594,-0.017899536,-0.041639842,0.02960743,-0.01679815,0.1015308,0.05703567,0.0069785477,0.009718613,0.008082357,-0.012589352,0.12446663,0.018861143,0.053296693,-0.03215604,0.010972261,-0.10968127,0.012995018,0.07786082,-0.017690469,0.03248199,-0.04271781,-0.06806753,-0.07297233,0.0035993687,0.011882267,0.016708337,-0.019617425,-0.047242224,0.014557032,0.04139613,-0.008320617,-0.04017011,-0.03703198,0.022198584,-0.021771815,0.02410845,0.016920311,-0.02317638,0.004767835,0.023395542,0.007503315,0.025918411,-0.016145818,-0.06478562,-0.08040128,0.002344442,-0.008921131,0.047266897,0.047718693,-0.0011429694,0.05584679,-0.024764488,0.0192903,0.024397915,-0.045773678,-0.024413446,0.021329394,-0.0469761,0.063250914,-0.046951707,0.0404466,-0.065411255,-0.038457282,0.029206796,-0.01588464,0.04883571,0.026628332,0.05048967,-0.012432704,-0.024493853,0.003305153,0.0039466517,-0.04878099,-0.033499695,0.040233597,0.01040456,0.0076247444,-0.052615903,0.036758807,-0.0438332,-0.015946992,-0.08943201,0.014444613,-0.0019423354,-0.020825846,-0.022945268,0.0867511,-0.09420672,0.011214681,-0.06201148,0.10487903,-0.015105568,-0.00086394855,0.04614532,0.05401257], 'Phrase_vector' ) + 1.0"
}
}
}|
|2020/06/26 13:17:37 076-qeswebmonitor-demoqwam-TRACE-QESHost_elastic62-getNombreReference()|: search template avant:SearchRequest{searchType=DFS_QUERY_THEN_FETCH, indices=[webmonitor_phrases], indicesOptions=IndicesOptions[ignore_unavailable=false, allow_no_indices=true, expand_wildcards_open=true, expand_wildcards_closed=false, allow_aliases_to_multiple_indices=true, forbid_closed_indices=true, ignore_aliases=false, ignore_throttled=true], types=[], routing='null', preference='null', requestCache=null, scroll=null, maxConcurrentShardRequests=0, batchedReduceSize=512, preFilterShardSize=128, allowPartialSearchResults=null, localClusterAlias=null, getOrCreateAbsoluteStartMillis=-1, ccsMinimizeRoundtrips=true, source={"from":0,"size":20,"explain":false,"track_total_hits":2147483647,"aggregations":{"FOLDER_ID.verbatim":{"terms":{"field":"FOLDER_ID.verbatim","size":10,"min_doc_count":1,"shard_min_doc_count":0,"show_term_doc_count_error":false,"order":[{"_count":"desc"},{"_key":"asc"}]}},"SOURCE.verbatim":{"terms":{"field":"SOURCE.verbatim","size":10,"min_doc_count":1,"shard_min_doc_count":0,"show_term_doc_count_error":false,"order":[{"_count":"desc"},{"_key":"asc"}]}},"DATE_COLLECT":{"terms":{"field":"DATE_COLLECT","size":10,"min_doc_count":1,"shard_min_doc_count":0,"show_term_doc_count_error":false,"order":{"_key":"desc"}}},"QES_Person.verbatim":{"terms":{"field":"QES_Person.verbatim","size":20,"min_doc_count":1,"shard_min_doc_count":0,"show_term_doc_count_error":false,"order":[{"_count":"desc"},{"_key":"asc"}]}},"QES_Company.verbatim":{"terms":{"field":"QES_Company.verbatim","size":20,"min_doc_count":1,"shard_min_doc_count":0,"show_term_doc_count_error":false,"order":[{"_count":"desc"},{"_key":"asc"}]}},"QES_Location.verbatim":{"terms":{"field":"QES_Location.verbatim","size":20,"min_doc_count":1,"shard_min_doc_count":0,"show_term_doc_count_error":false,"order":[{"_count":"desc"},{"_key":"asc"}]}},"QES_Event.verbatim":{"terms":{"field":"QES_Event.verbatim","size":20,"
min_doc_count":1,"shard_min_doc_count":0,"show_term_doc_count_error":false,"order":[{"_count":"desc"},{"_key":"asc"}]}},"QES_Concept.verbatim":{"terms":{"field":"QES_Concept.verbatim","size":20,"min_doc_count":1,"shard_min_doc_count":0,"show_term_doc_count_error":false,"order":[{"_count":"desc"},{"_key":"asc"}]}},"QES_Organization.verbatim":{"terms":{"field":"QES_Organization.verbatim","size":20,"min_doc_count":1,"shard_min_doc_count":0,"show_term_doc_count_error":false,"order":[{"_count":"desc"},{"_key":"asc"}]}}}}}||
|2020/06/26 13:17:37 085-qeswebmonitor-demoqwam-ERROR-QESHost_elastic62-getNombreReference()|: Recuperation nombre reference impossible:Elasticsearch exception [type=parsing_exception, reason=Unknown key for a START_OBJECT in [script_score].]||
The generated request looks pretty much the same as the one I ran in Kibana.
I cannot figure out what is going wrong or why I am getting this message:
Elasticsearch exception [type=parsing_exception, reason=Unknown key for a START_OBJECT in [script_score].]
Can you help me?