Different Length Strings with Same FieldNorm


(James Bathgate) #1

I'm having an issue when using an edgeNGram filter: two field values
containing strings of different lengths come back with the same fieldNorm.
I've written a basic, simple test case for review, which you can see below.
I just can't seem to figure out why "ktm monster graphics" is getting the
same score as "monster graphics". Can anyone shed any light on this?

# Reproduction script: rebuilds the "jtest" index on localhost:9200, indexes
# two documents into the "test_value" field, and runs a text query for
# "monster graphics" against them.

# Step 1: delete any existing "jtest" index.
echo "Deleting old index... ";

curl -XDELETE 'localhost:9200/jtest/';

# Emits a carriage return after curl's output (repeated after each request).
echo -e "\r";

# Step 2: create the index with a single shard and no replicas.
# Two custom analyzers are defined, both using the whitespace tokenizer:
#   - "iVal" (the field's index_analyzer): lowercase + the "partial"
#     edgeNGram filter (min_gram 2, max_gram 20).
#   - "sVal" (the field's search_analyzer): lowercase only.
echo "Creating new index... ";

curl -XPUT 'http://localhost:9200/jtest/' -d '{
"settings" : {
"index" : {
"number_of_shards" : "1",
"number_of_replicas" : 0
},
"analysis" : {
"analyzer" : {
"iVal" : {
"type" : "custom",
"tokenizer" : "whitespace",
"filter" : ["lowercase", "partial"]
},
"sVal" : {
"type" : "custom",
"tokenizer" : "whitespace",
"filter" : ["lowercase"]
}
},
"filter" : {
"partial" : {
"type" : "edgeNGram",
"min_gram" : "2",
"max_gram" : "20"
}
}
}
},
"mappings" : {
"test_type" : {
"properties" : {
"test_value" : {
"type" : "string",
"store" : true,
"index" : "analyzed",
"index_analyzer" : "iVal",
"search_analyzer" : "sVal"
}
}
}
}
}';

echo -e "\r";

# Step 3: index the two test documents. Document 2 holds the longer string
# ("ktm monster graphics"); document 1 holds the shorter one
# ("monster graphics").
echo "Inserting Data...";

curl -XPUT 'http://localhost:9200/jtest/test_type/2' -d '{
"test_value" : "ktm monster graphics"
}';

echo -e "\r";

curl -XPUT 'http://localhost:9200/jtest/test_type/1' -d '{
"test_value" : "monster graphics"
}';

echo -e "\r";

# Pause for 5 seconds before searching -- presumably to give the index time
# to refresh so the new documents are searchable (TODO confirm).
sleep 5

# Step 4: run a text query for "monster graphics" on test_value and
# pretty-print the response.
echo "Performing Search Query...";

curl -XGET 'http://localhost:9200/jtest/test_type/_search?pretty=1' -d '{
"query" : {
"text" : {
"test_value" : "monster graphics"
}
}
}';

echo -e "\r";


(James Bathgate) #2

Just in case anyone wants them, here are the results of that last query with
explain enabled:

{
"took" : 2,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"failed" : 0
},
"hits" : {
"total" : 2,
"max_score" : 0.10509991,
"hits" : [ {
"_shard" : 0,
"_node" : "nPbYBu5UQquwUEVVht03CA",
"_index" : "jtest",
"_type" : "test_type",
"_id" : "2",
"_score" : 0.10509991, "_source" : {
"test_value" : "ktm monster graphics"
},
"_explanation" : {
"value" : 0.10509991,
"description" : "sum of:",
"details" : [ {
"value" : 0.052549955,
"description" : "weight(test_value:monster in 0), product of:",
"details" : [ {
"value" : 0.70710677,
"description" : "queryWeight(test_value:monster), product of:",
"details" : [ {
"value" : 0.5945349,
"description" : "idf(docFreq=2, maxDocs=2)"
}, {
"value" : 1.1893445,
"description" : "queryNorm"
} ]
}, {
"value" : 0.07431686,
"description" : "fieldWeight(test_value:monster in 0), product of:",
"details" : [ {
"value" : 1.0,
"description" : "tf(termFreq(test_value:monster)=1)"
}, {
"value" : 0.5945349,
"description" : "idf(docFreq=2, maxDocs=2)"
}, {
"value" : 0.125,
"description" : "fieldNorm(field=test_value, doc=0)"
} ]
} ]
}, {
"value" : 0.052549955,
"description" : "weight(test_value:graphics in 0), product of:",
"details" : [ {
"value" : 0.70710677,
"description" : "queryWeight(test_value:graphics), product of:",
"details" : [ {
"value" : 0.5945349,
"description" : "idf(docFreq=2, maxDocs=2)"
}, {
"value" : 1.1893445,
"description" : "queryNorm"
} ]
}, {
"value" : 0.07431686,
"description" : "fieldWeight(test_value:graphics in 0), product of:",
"details" : [ {
"value" : 1.0,
"description" : "tf(termFreq(test_value:graphics)=1)"
}, {
"value" : 0.5945349,
"description" : "idf(docFreq=2, maxDocs=2)"
}, {
"value" : 0.125,
"description" : "fieldNorm(field=test_value, doc=0)"
} ]
} ]
} ]
}
}, {
"_shard" : 0,
"_node" : "nPbYBu5UQquwUEVVht03CA",
"_index" : "jtest",
"_type" : "test_type",
"_id" : "1",
"_score" : 0.10509991, "_source" : {
"test_value" : "monster graphics"
},
"_explanation" : {
"value" : 0.10509991,
"description" : "sum of:",
"details" : [ {
"value" : 0.052549955,
"description" : "weight(test_value:monster in 1), product of:",
"details" : [ {
"value" : 0.70710677,
"description" : "queryWeight(test_value:monster), product of:",
"details" : [ {
"value" : 0.5945349,
"description" : "idf(docFreq=2, maxDocs=2)"
}, {
"value" : 1.1893445,
"description" : "queryNorm"
} ]
}, {
"value" : 0.07431686,
"description" : "fieldWeight(test_value:monster in 1), product of:",
"details" : [ {
"value" : 1.0,
"description" : "tf(termFreq(test_value:monster)=1)"
}, {
"value" : 0.5945349,
"description" : "idf(docFreq=2, maxDocs=2)"
}, {
"value" : 0.125,
"description" : "fieldNorm(field=test_value, doc=1)"
} ]
} ]
}, {
"value" : 0.052549955,
"description" : "weight(test_value:graphics in 1), product of:",
"details" : [ {
"value" : 0.70710677,
"description" : "queryWeight(test_value:graphics), product of:",
"details" : [ {
"value" : 0.5945349,
"description" : "idf(docFreq=2, maxDocs=2)"
}, {
"value" : 1.1893445,
"description" : "queryNorm"
} ]
}, {
"value" : 0.07431686,
"description" : "fieldWeight(test_value:graphics in 1), product of:",
"details" : [ {
"value" : 1.0,
"description" : "tf(termFreq(test_value:graphics)=1)"
}, {
"value" : 0.5945349,
"description" : "idf(docFreq=2, maxDocs=2)"
}, {
"value" : 0.125,
"description" : "fieldNorm(field=test_value, doc=1)"
} ]
} ]
} ]
}
} ]
}
}


(Clinton Gormley) #3

On Thu, 2011-12-22 at 10:53 -0800, James Bathgate wrote:

I'm having an issue when using an nGram filter with the fieldNorm of
two fields with different length strings coming back with the same
fieldNorm. I've written a base simple test case for review which you
can see below. I just can't seem to figure out why "ktm monster
graphics" is getting the same score as "monster graphics". Can anyone
shed any light on this?

Field norms are stored in 8 bits, so small differences in field length,
when rounded down to 8 bits, are considered to be equal.

If you make the longer string just a bit longer (e.g. 'ktmxy monster
graphics'), then you see a difference:

"hits" : {
"total" : 2,
"max_score" : 0.21019982,
"hits" : [ {
"_index" : "jtest",
"_type" : "test_type",
"_id" : "1",
"_score" : 0.21019982, "_source" : {
"test_value" : "monster graphics"
}
}, {
"_index" : "jtest",
"_type" : "test_type",
"_id" : "2",
"_score" : 0.18392484, "_source" : {
"test_value" : "ktmxy monster graphics"
}
} ]
}

clint


(system) #4