ES query from spark returns all despite of filter

Hello All,

I am trying to access index from databricks via es query and wanted a result set based on query filter, add to data frame.
Requirement: Get output with :cd.ID=1 and cd.s_D<>0.
I get: All the cd.ID both 0 and one.
When I run this in elastic console , I get exact output where as with DF I get all . Any help would be appreciated.

query="""
{
  "size" : 1000,
  "query" : {
    "bool" : {
      "must" : [
        {
          "bool" : {
            "must" : [
              {
                "term" : {
                  "PtK" : {
                    "value" : "123",
                    "boost" : 1.0
                  }
                }
              },
              {
                "nested" : {
                  "query" : {
                    "term" : {
                      "cd.ID" : {
                        "value" : 1,
                        "boost" : 1.0
                      }
                    }
                  },
                  "path" : "cd",
                  "ignore_unmapped" : false,
                  "score_mode" : "none",
                  "boost" : 1.0,
                  "inner_hits" : {
                    "name" : "cd_61",
                    "ignore_unmapped" : false,
                    "from" : 0,
                    "size" : 99,
                    "version" : false,
                    "seq_no_primary_term" : false,
                    "explain" : false,
                    "track_scores" : false,
                    "_source" : false,
                    "stored_fields" : "_none_"
                    
                  }
                }
              }
            ],
            "boost" : 1.0
          }
        },
        {
          "nested" : {
            "query" : {
              "bool" : {
                "must_not" : [
                  {
                    "term" : {
                      "cd.s_D" : {
                        "value" : 0,
                        "boost" : 1.0
                      }
                    }
                  }
                ],
                "boost" : 1.0
              }
            },
            "path" : "cd",
            "ignore_unmapped" : false,
            "score_mode" : "none",
            "boost" : 1.0,
            "inner_hits" : {
              "name" : "cd_62",
              "ignore_unmapped" : false,
              "from" : 0,
              "size" : 99,
              "version" : false,
              "seq_no_primary_term" : false,
              "explain" : false,
              "track_scores" : false,
              "_source" : false,
              "stored_fields" : "_none_",
              "fields" : [
                {
                  "field" : "cd.s_D"
                }
              ]
            }
          }
        }
      ],
      "boost" : 1.0
    }
  },
  "_source" : false,
  "fields" : [
    {
      "field" : "PtK"
    }
  ]
}

"""


reader = spark.read \
.format("org.elasticsearch.spark.sql") \
.option("es.read.metadata","true") \
.option("es.nodes.wan.only","true") \
.option("es.port","XXX") \
.option("es.net.ssl","true") \
.option("es.net.http.auth.user",user) \
.option("es.net.http.auth.pass",pw) \
.option("es.nodes", url) \
.option("es.query", query)   

df = reader.load("myindex")
df.createOrReplaceTempView('Finalresult')

Hi @aadhikari3711. I would expect that to work. Can you post your mapping?

Hi @Keith_Massey ,
Thank you for responding.
Please find the mapping below. It's with masking the actual field.
In elastic console I get exact result, but when querying through databricks like mentioned above, I get ones that qualifies the condition and one does not.

"mappings": {
        "dynamic": "false", 
        "properties": { 
            "PtK": { "type": "keyword" },
            "cd": {"type": "nested",
                "properties": {                
                "ID": {"type": "keyword"},
                "S_D": {"type": "keyword"},
                "Field2": {"type": "keyword"},
                "Field3": {"type": "keyword"},
                "Field4":{"type": "keyword"},
                "Field6":{"type": "keyword"},                
                "Field7": {"type": "date"}}},
            "th": {"type": "nested",
                "properties": {                
				  "th1": { "type": "keyword"},
				  "th2": {"type": "keyword"},
				  "th3": {"type": "date"},
				  "th4": {"type": "keyword"},              
				  "th5": {"type": "keyword"},              
				  "th6": {"type": "date"}}},   
            "pr": {"type": "nested",
                "properties": {                
                "pr1": { "type": "keyword"},
                "pr2":{ "type": "keyword"},
                "pr3":{ "type": "keyword"},
                "pr5":{ "type": "keyword"},                
                "pr6": {"type": "keyword"}}},
            "cob": {"type": "nested",
                "properties": {                
                "cob1": {"type": "keyword"},
                "cob2": {"type": "keyword"},
                "cob3": {"type": "keyword"},
                "cob4": {"type": "keyword"},
                "cob5":{"type": "keyword"},
                "cb6": {"type": "keyword"}}}
                }
         }

Actually could you also post the data to reproduce this? I tried creating some test data but I'm not sure if it's the same as what you're using. It doesn't have to be the full mapping or the full data -- just enough to reproduce the problem. Ideally you could post commands to create the index with mappings and commands to insert however many documents it takes to reproduce the problem with the spark code you have posted. Thanks.

@Keith_Massey
Please find the sample data below :

Ptk1	ID	S_D
1	1	202
	2	0
	1	202
	1	217
	1	700
	2	0
	3	0
	4	0
	5	0
	6	0


Can you post a curl command to create the documents? For example your query requires Ptk to be 123 right? The easier you make it for someone to reproduce this, the more likely someone will be able to help.

@Keith_Massey

Please find the json with data

POST /testindex/
  {
          
          "Ptk" : 1,
          "cd" : [
            {
              "ID" : 1,
              "S_D" : 202
            },
            {
              "ID" : 1,
              "S_D" : 217
            },
            {
              "ID" : 2,
              "S_D" : 0
            },
            {
              "ID" : 1,
              "S_D" : 700
            },
            {
              "ID" : 3,
              "S_D" : 0
            },
            {
              "ID" : 4,
              "S_D" : 0
            },
            {
              "ID" : 5,
              "S_D" : 0
            },
            {
              "ID" : 6,
              "S_D" : 0
            }
          ]
      
  }

I get no hits from your query going through the elasticsearch API (I believe because your data does not have Ptk set to 123 in any documents). Are you sure that you are using the same query through the Elasticsearch API and through the es-spark API?

Create the index with mapping:

curl -XPUT "http://localhost:9200/testindex" -H 'Content-Type: application/json' -d'
{
  "mappings": {
    "dynamic": "false",
    "properties": {
      "PtK": {
        "type": "keyword"
      },
      "cd": {
        "type": "nested",
        "properties": {
          "ID": {
            "type": "keyword"
          },
          "S_D": {
            "type": "keyword"
          },
          "Field2": {
            "type": "keyword"
          },
          "Field3": {
            "type": "keyword"
          },
          "Field4": {
            "type": "keyword"
          },
          "Field6": {
            "type": "keyword"
          },
          "Field7": {
            "type": "date"
          }
        }
      },
      "th": {
        "type": "nested",
        "properties": {
          "th1": {
            "type": "keyword"
          },
          "th2": {
            "type": "keyword"
          },
          "th3": {
            "type": "date"
          },
          "th4": {
            "type": "keyword"
          },
          "th5": {
            "type": "keyword"
          },
          "th6": {
            "type": "date"
          }
        }
      },
      "pr": {
        "type": "nested",
        "properties": {
          "pr1": {
            "type": "keyword"
          },
          "pr2": {
            "type": "keyword"
          },
          "pr3": {
            "type": "keyword"
          },
          "pr5": {
            "type": "keyword"
          },
          "pr6": {
            "type": "keyword"
          }
        }
      },
      "cob": {
        "type": "nested",
        "properties": {
          "cob1": {
            "type": "keyword"
          },
          "cob2": {
            "type": "keyword"
          },
          "cob3": {
            "type": "keyword"
          },
          "cob4": {
            "type": "keyword"
          },
          "cob5": {
            "type": "keyword"
          },
          "cb6": {
            "type": "keyword"
          }
        }
      }
    }
  }
}'

Insert a single document:

curl -XPUT "http://localhost:9200/testindex/_doc/1" -H 'Content-Type: application/json' -d'
{
  "Ptk": 1,
  "cd": [
    {
      "ID": 1,
      "S_D": 202
    },
    {
      "ID": 1,
      "S_D": 217
    },
    {
      "ID": 2,
      "S_D": 0
    },
    {
      "ID": 1,
      "S_D": 700
    },
    {
      "ID": 3,
      "S_D": 0
    },
    {
      "ID": 4,
      "S_D": 0
    },
    {
      "ID": 5,
      "S_D": 0
    },
    {
      "ID": 6,
      "S_D": 0
    }
  ]
}'

And then the query that returns 0 hits:

curl -XGET "http://localhost:9200/testindex/_search" -H 'Content-Type: application/json' -d'
{
  "size": 1000,
  "query": {
    "bool": {
      "must": [
        {
          "bool": {
            "must": [
              {
                "term": {
                  "PtK": {
                    "value": "123",
                    "boost": 1
                  }
                }
              },
              {
                "nested": {
                  "query": {
                    "term": {
                      "cd.ID": {
                        "value": 1,
                        "boost": 1
                      }
                    }
                  },
                  "path": "cd",
                  "ignore_unmapped": false,
                  "score_mode": "none",
                  "boost": 1,
                  "inner_hits": {
                    "name": "cd_61",
                    "ignore_unmapped": false,
                    "from": 0,
                    "size": 99,
                    "version": false,
                    "seq_no_primary_term": false,
                    "explain": false,
                    "track_scores": false,
                    "_source": false,
                    "stored_fields": "_none_"
                  }
                }
              }
            ],
            "boost": 1
          }
        },
        {
          "nested": {
            "query": {
              "bool": {
                "must_not": [
                  {
                    "term": {
                      "cd.s_D": {
                        "value": 0,
                        "boost": 1
                      }
                    }
                  }
                ],
                "boost": 1
              }
            },
            "path": "cd",
            "ignore_unmapped": false,
            "score_mode": "none",
            "boost": 1,
            "inner_hits": {
              "name": "cd_62",
              "ignore_unmapped": false,
              "from": 0,
              "size": 99,
              "version": false,
              "seq_no_primary_term": false,
              "explain": false,
              "track_scores": false,
              "_source": false,
              "stored_fields": "_none_",
              "fields": [
                {
                  "field": "cd.s_D"
                }
              ]
            }
          }
        }
      ],
      "boost": 1
    }
  },
  "_source": false,
  "fields": [
    {
      "field": "PtK"
    }
  ]
}'

@Keith_Massey

I use this query and it works.

{
  "size" : 1000,
  "query" : {
    "bool" : {
      "must" : [
        {
          "nested" : {
            "query" : {
              "term" : {
                "cd.ID" : {
                  "value" : 1,
                  "boost" : 1.0
                }
              }
            },
            "path" : "cd",
            "ignore_unmapped" : false,
            "score_mode" : "none",
            "boost" : 1.0,
            "inner_hits" : {
              "name" : "cd_141",
              "ignore_unmapped" : false,
              "from" : 0,
              "size" : 99,
              "version" : false,
              "seq_no_primary_term" : false,
              "explain" : false,
              "track_scores" : false,
              "_source" : false,
              "stored_fields" : "_none_",
              "fields" : [
                {
                  "field" : "cd.S_D"
                },
                {
                  "field" : "cd.ID"
                }
              ]
            }
          }
        },
        {
          "nested" : {
            "query" : {
              "bool" : {
                "must_not" : [
                  {
                    "term" : {
                      "cd.S_D" : {
                        "value" : 0,
                        "boost" : 1.0
                      }
                    }
                  }
                ],
                "boost" : 1.0
              }
            },
            "path" : "cd",
            "ignore_unmapped" : false,
            "score_mode" : "none",
            "boost" : 1.0,
            "inner_hits" : {
              "name" : "cd_142",
              "ignore_unmapped" : false,
              "from" : 0,
              "size" : 99,
              "version" : false,
              "seq_no_primary_term" : false,
              "explain" : false,
              "track_scores" : false,
              "_source" : false,
              "stored_fields" : "_none_",
              "fields" : [
                {
                  "field" : "cd.S_D"
                },
                {
                  "field" : "cd.ID"
                }
              ]
            }
          }
        }
      ],
      "boost" : 1.0
    }
  },
  "_source" : false,
  "fields" : [
    {
      "field" : "PtK"
    }
  ],
  "sort" : [
    {
      "_doc" : {
        "order" : "asc"
      }
    }
  ]
}

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.