Problem with sorting Polish words (ES 2.4.6)


(Pawel Malerowicz) #1

Hello.
Sorting returns results in the wrong order when product names contain Polish characters.

ElasticSearch Version: 2.4.6, Build: 5376dca/2017-07-18T12:17:44Z, JVM: 1.7.0_151
"version" : {
"number" : "2.4.6",
"build_hash" : "5376dca9f70f3abef96a77f4bb22720ace8240fd",
"build_timestamp" : "2017-07-18T12:17:44Z",
"build_snapshot" : false,
"lucene_version" : "5.5.4"
},

Plugins: Installed plugins in /usr/share/elasticsearch/plugins:
- analysis-stempel
- analysis-phonetic
- analysis-icu

Steps to reproduce
I added 4 rows with names:
Łatwa naz
Latwa naz
Łatwa naa
Latwa naa

I search for "latwa" and sort by "name" DESC.

Expected result
Łatwa naz
Łatwa naa
Latwa naz
Latwa naa

In descending order, "Ł" should come before "L".

Actual result
Łatwa naz
Latwa naz
Łatwa naa
Latwa naa

Configuration:
"pl_ascii_folding": {
"type": "asciifolding",
"preserve_original": "1"
},

"sortable": {
"type": "custom",
"filter": [
"pl_ascii_folding",
"lowercase"
],
"tokenizer": "keyword"
}
Query:
?q=name:latwa&sort=name.sortable:DESC

The problem described in the topic "Problem with sort with polish(POLAND) words!" is not the same as this one.

With Elasticsearch version 6.1 the sorting works correctly.


(Adrien Grand) #2

This behaviour is surprising to me. Can you share a full recreation, including mappings?


(Pawel Malerowicz) #3
{
  "magento2_default_catalog_product_20180124_102302": {
    "mappings": {
      "product": {
        "_all": {
          "enabled": false
        },
        "properties": {
          "name": {
            "type": "string",
            "norms": {
              "enabled": false
            },
            "fielddata": {
              "format": "disabled"
            },
            "fields": {
              "shingle": {
                "type": "string",
                "norms": {
                  "enabled": false
                },
                "index_options": "docs",
                "fielddata": {
                  "format": "disabled"
                },
                "analyzer": "shingle"
              },
              "sortable": {
                "type": "string",
                "norms": {
                  "enabled": false
                },
                "index_options": "docs",
                "analyzer": "sortable"
              },
              "whitespace": {
                "type": "string",
                "norms": {
                  "enabled": false
                },
                "fielddata": {
                  "format": "disabled"
                },
                "analyzer": "whitespace"
              }
            },
            "copy_to": [
              "search",
              "spelling",
              "autocomplete"
            ],
            "analyzer": "standard"
          }
        }
      }
    }
  }
}
{
  "magento2_default_catalog_product_20180124_102302": {
    "settings": {
      "index": {
        "creation_date": "1516789382129",
        "requests": {
          "cache": {
            "enable": "true"
          }
        },
        "number_of_replicas": "0",
        "translog": {
          "durability": "request",
          "disable_flush": "false"
        },
        "merge": {
          "scheduler": {
            "max_thread_count": "1"
          }
        },
        "uuid": "Pf9-vm2ZRqGoupq-KsUgtg",
        "analysis": {
          "char_filter": {
            "html_strip": {
              "type": "html_strip"
            }
          },
          "filter": {
            "phonetic": {
              "type": "phonetic",
              "encoder": "metaphone"
            },
            "word_delimiter": {
              "split_on_numerics": "1",
              "preserve_original": "1",
              "catenate_all": "1",
              "type": "word_delimiter",
              "catenate_words": "1",
              "catenate_numbers": "1",
              "split_on_case_change": "1",
              "generate_word_parts": "1"
            },
            "lowercase": {
              "type": "lowercase"
            },
            "pl_ascii_folding": {
              "type": "asciifolding",
              "preserve_original": "1"
            },
            "shingle": {
              "type": "shingle",
              "min_shingle_size": "2",
              "max_shingle_size": "2",
              "output_unigrams": "0"
            },
            "trim": {
              "type": "trim"
            },
            "ascii_folding": {
              "type": "asciifolding",
              "preserve_original": "0"
            }
          },
          "analyzer": {
            "whitespace": {
              "type": "custom",
              "char_filter": [
                "html_strip"
              ],
              "filter": [
                "word_delimiter",
                "lowercase",
                "ascii_folding",
                "trim"
              ],
              "tokenizer": "whitespace"
            },
            "shingle": {
              "type": "custom",
              "char_filter": [
                "html_strip"
              ],
              "filter": [
                "word_delimiter",
                "lowercase",
                "ascii_folding",
                "trim",
                "shingle"
              ],
              "tokenizer": "whitespace"
            },
            "phonetic": {
              "type": "custom",
              "char_filter": [
                "html_strip"
              ],
              "filter": [
                "word_delimiter",
                "lowercase",
                "ascii_folding",
                "trim",
                "phonetic"
              ],
              "tokenizer": "whitespace"
            },
            "standard": {
              "type": "custom",
              "char_filter": [
                "html_strip"
              ],
              "filter": [
                "word_delimiter",
                "lowercase",
                "ascii_folding",
                "trim"
              ],
              "tokenizer": "whitespace"
            },
            "sortable": {
              "type": "custom",
              "char_filter": [
                "html_strip"
              ],
              "filter": [
                "lowercase",
                "pl_ascii_folding",
                "trim"
              ],
              "tokenizer": "keyword"
            }
          }
        },
        "number_of_shards": "1",
        "refresh_interval": "1s",
        "version": {
          "created": "2040699"
        }
      }
    }
  }
}

(Pawel Malerowicz) #4

Does anyone have any idea how to fix this?


(system) #5

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.