Different Length Strings with Same FieldNorm

I'm having an issue when using an nGram filter: two documents whose field
values are strings of different lengths are coming back with the same
fieldNorm. I've written a simple base test case for review, which you can
see below. I just can't seem to figure out why "ktm monster graphics" is
getting the same score as "monster graphics". Can anyone shed any light on
this?

# Reproduction script: builds a single-shard index whose "test_value" field
# is indexed with edge n-grams (analyzer "iVal") but searched with plain
# lowercase tokens (analyzer "sVal"), indexes two documents of different
# lengths, and runs a text query that matches both of them.

# Start from a clean slate so results from earlier runs cannot leak in.
echo "Deleting old index... "
curl -XDELETE 'http://localhost:9200/jtest/'
# A bare echo puts a newline between curl's output and the next message.
echo

echo "Creating new index... "
curl -XPUT 'http://localhost:9200/jtest/' -d '{
  "settings" : {
    "index" : {
      "number_of_shards" : 1,
      "number_of_replicas" : 0
    },
    "analysis" : {
      "analyzer" : {
        "iVal" : {
          "type" : "custom",
          "tokenizer" : "whitespace",
          "filter" : ["lowercase", "partial"]
        },
        "sVal" : {
          "type" : "custom",
          "tokenizer" : "whitespace",
          "filter" : ["lowercase"]
        }
      },
      "filter" : {
        "partial" : {
          "type" : "edgeNGram",
          "min_gram" : 2,
          "max_gram" : 20
        }
      }
    }
  },
  "mappings" : {
    "test_type" : {
      "properties" : {
        "test_value" : {
          "type" : "string",
          "store" : true,
          "index" : "analyzed",
          "index_analyzer" : "iVal",
          "search_analyzer" : "sVal"
        }
      }
    }
  }
}'
echo

echo "Inserting Data..."
# Document 2: the longer string (three whitespace-separated words).
curl -XPUT 'http://localhost:9200/jtest/test_type/2' -d '{
  "test_value" : "ktm monster graphics"
}'
echo

# Document 1: the shorter string (two whitespace-separated words).
curl -XPUT 'http://localhost:9200/jtest/test_type/1' -d '{
  "test_value" : "monster graphics"
}'
echo

# Make both documents visible to search before querying. An explicit refresh
# is deterministic, unlike sleeping and hoping the periodic refresh has run.
curl -XPOST 'http://localhost:9200/jtest/_refresh'
echo

echo "Performing Search Query..."
curl -XGET 'http://localhost:9200/jtest/test_type/_search?pretty=1' -d '{
  "query" : {
    "text" : {
      "test_value" : "monster graphics"
    }
  }
}'
echo

Just in case anyone wants them, here are the results of that last query
with explain enabled:

{
"took" : 2,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"failed" : 0
},
"hits" : {
"total" : 2,
"max_score" : 0.10509991,
"hits" : [ {
"_shard" : 0,
"_node" : "nPbYBu5UQquwUEVVht03CA",
"_index" : "jtest",
"_type" : "test_type",
"_id" : "2",
"_score" : 0.10509991, "_source" : {
"test_value" : "ktm monster graphics"
},
"_explanation" : {
"value" : 0.10509991,
"description" : "sum of:",
"details" : [ {
"value" : 0.052549955,
"description" : "weight(test_value:monster in 0), product of:",
"details" : [ {
"value" : 0.70710677,
"description" : "queryWeight(test_value:monster), product of:",
"details" : [ {
"value" : 0.5945349,
"description" : "idf(docFreq=2, maxDocs=2)"
}, {
"value" : 1.1893445,
"description" : "queryNorm"
} ]
}, {
"value" : 0.07431686,
"description" : "fieldWeight(test_value:monster in 0), product of:",
"details" : [ {
"value" : 1.0,
"description" : "tf(termFreq(test_value:monster)=1)"
}, {
"value" : 0.5945349,
"description" : "idf(docFreq=2, maxDocs=2)"
}, {
"value" : 0.125,
"description" : "fieldNorm(field=test_value, doc=0)"
} ]
} ]
}, {
"value" : 0.052549955,
"description" : "weight(test_value:graphics in 0), product of:",
"details" : [ {
"value" : 0.70710677,
"description" : "queryWeight(test_value:graphics), product of:",
"details" : [ {
"value" : 0.5945349,
"description" : "idf(docFreq=2, maxDocs=2)"
}, {
"value" : 1.1893445,
"description" : "queryNorm"
} ]
}, {
"value" : 0.07431686,
"description" : "fieldWeight(test_value:graphics in 0), product of:",
"details" : [ {
"value" : 1.0,
"description" : "tf(termFreq(test_value:graphics)=1)"
}, {
"value" : 0.5945349,
"description" : "idf(docFreq=2, maxDocs=2)"
}, {
"value" : 0.125,
"description" : "fieldNorm(field=test_value, doc=0)"
} ]
} ]
} ]
}
}, {
"_shard" : 0,
"_node" : "nPbYBu5UQquwUEVVht03CA",
"_index" : "jtest",
"_type" : "test_type",
"_id" : "1",
"_score" : 0.10509991, "_source" : {
"test_value" : "monster graphics"
},
"_explanation" : {
"value" : 0.10509991,
"description" : "sum of:",
"details" : [ {
"value" : 0.052549955,
"description" : "weight(test_value:monster in 1), product of:",
"details" : [ {
"value" : 0.70710677,
"description" : "queryWeight(test_value:monster), product of:",
"details" : [ {
"value" : 0.5945349,
"description" : "idf(docFreq=2, maxDocs=2)"
}, {
"value" : 1.1893445,
"description" : "queryNorm"
} ]
}, {
"value" : 0.07431686,
"description" : "fieldWeight(test_value:monster in 1), product of:",
"details" : [ {
"value" : 1.0,
"description" : "tf(termFreq(test_value:monster)=1)"
}, {
"value" : 0.5945349,
"description" : "idf(docFreq=2, maxDocs=2)"
}, {
"value" : 0.125,
"description" : "fieldNorm(field=test_value, doc=1)"
} ]
} ]
}, {
"value" : 0.052549955,
"description" : "weight(test_value:graphics in 1), product of:",
"details" : [ {
"value" : 0.70710677,
"description" : "queryWeight(test_value:graphics), product of:",
"details" : [ {
"value" : 0.5945349,
"description" : "idf(docFreq=2, maxDocs=2)"
}, {
"value" : 1.1893445,
"description" : "queryNorm"
} ]
}, {
"value" : 0.07431686,
"description" : "fieldWeight(test_value:graphics in 1), product of:",
"details" : [ {
"value" : 1.0,
"description" : "tf(termFreq(test_value:graphics)=1)"
}, {
"value" : 0.5945349,
"description" : "idf(docFreq=2, maxDocs=2)"
}, {
"value" : 0.125,
"description" : "fieldNorm(field=test_value, doc=1)"
} ]
} ]
} ]
}
} ]
}
}

On Thu, 2011-12-22 at 10:53 -0800, James Bathgate wrote:

> I'm having an issue when using an nGram filter with the fieldNorm of
> two fields with different length strings coming back with the same
> fieldNorm. I've written a base simple test case for review which you
> can see below. I just can't seem to figure out why "ktm monster
> graphics" is getting the same score as "monster graphics". Can anyone
> shed any light on this?

Field norms are stored in a single byte (8 bits), so small differences in
field length are lost when the norm is rounded to fit in that byte, and the
two norms end up equal.

If you make the longer string just a bit longer (e.g. 'ktmxy monster
graphics'), then you see a difference:

"hits" : {
"total" : 2,
"max_score" : 0.21019982,
"hits" : [ {
"_index" : "jtest",
"_type" : "test_type",
"_id" : "1",
"_score" : 0.21019982, "_source" : {
"test_value" : "monster graphics"
}
}, {
"_index" : "jtest",
"_type" : "test_type",
"_id" : "2",
"_score" : 0.18392484, "_source" : {
"test_value" : "ktmxy monster graphics"
}
} ]
}

clint