Search with synonyms does not work

Hi, with great assistance I was able to add my own synonym file to my search.

After updating the template, I deleted the index and used -restart to reinitialize it (as David mentioned).

The settings now look like this:

{
  "component_templates": [
    {
      "name": "fscrawler_settings_total_fields",
      "component_template": {
        "template": {
          "settings": {
            "index": {
              "mapping": {
                "total_fields": {
                  "limit": "2000"
                }
              }
            }
          }
        }
      }
    },
    {
      "name": "fscrawler_mapping_file",
      "component_template": {
        "template": {
          "mappings": {
            "properties": {
              "file": {
                "properties": {
                  "extension": {
                    "type": "keyword"
                  },
                  "filename": {
                    "store": true,
                    "type": "keyword"
                  },
                  "indexed_chars": {
                    "type": "long"
                  },
                  "last_accessed": {
                    "format": "date_optional_time",
                    "type": "date"
                  },
                  "content_type": {
                    "type": "keyword"
                  },
                  "indexing_date": {
                    "format": "date_optional_time",
                    "type": "date"
                  },
                  "created": {
                    "format": "date_optional_time",
                    "type": "date"
                  },
                  "checksum": {
                    "type": "keyword"
                  },
                  "filesize": {
                    "type": "long"
                  },
                  "last_modified": {
                    "format": "date_optional_time",
                    "type": "date"
                  },
                  "url": {
                    "index": false,
                    "type": "keyword"
                  }
                }
              }
            }
          }
        }
      }
    },
    {
      "name": "fscrawler_mapping_attributes",
      "component_template": {
        "template": {
          "mappings": {
            "properties": {
              "attributes": {
                "properties": {
                  "owner": {
                    "type": "keyword"
                  },
                  "group": {
                    "type": "keyword"
                  }
                }
              }
            }
          }
        }
      }
    },
    {
      "name": "fscrawler_mapping_path",
      "component_template": {
        "template": {
          "settings": {
            "index": {
              "analysis": {
                "analyzer": {
                  "fscrawler_path": {
                    "tokenizer": "fscrawler_path"
                  }
                },
                "tokenizer": {
                  "fscrawler_path": {
                    "type": "path_hierarchy"
                  }
                }
              }
            }
          },
          "mappings": {
            "properties": {
              "path": {
                "properties": {
                  "virtual": {
                    "type": "keyword",
                    "fields": {
                      "tree": {
                        "fielddata": true,
                        "analyzer": "fscrawler_path",
                        "type": "text"
                      },
                      "fulltext": {
                        "type": "text"
                      }
                    }
                  },
                  "root": {
                    "type": "keyword"
                  },
                  "real": {
                    "type": "keyword",
                    "fields": {
                      "tree": {
                        "fielddata": true,
                        "analyzer": "fscrawler_path",
                        "type": "text"
                      },
                      "fulltext": {
                        "type": "text"
                      }
                    }
                  }
                }
              }
            }
          }
        }
      }
    },
    {
      "name": "fscrawler_alias",
      "component_template": {
        "template": {
          "aliases": {
            "fscrawler": {}
          }
        }
      }
    },
    {
      "name": "fscrawler_mapping_meta",
      "component_template": {
        "template": {
          "mappings": {
            "dynamic_templates": [
              {
                "raw_as_text": {
                  "path_match": "meta.raw.*",
                  "mapping": {
                    "type": "text",
                    "fields": {
                      "keyword": {
                        "ignore_above": 256,
                        "type": "keyword"
                      }
                    }
                  }
                }
              }
            ],
            "properties": {
              "meta": {
                "properties": {
                  "date": {
                    "format": "date_optional_time",
                    "type": "date"
                  },
                  "altitude": {
                    "type": "text"
                  },
                  "keywords": {
                    "type": "text"
                  },
                  "modifier": {
                    "type": "text"
                  },
                  "latitude": {
                    "type": "text"
                  },
                  "rating": {
                    "type": "byte"
                  },
                  "description": {
                    "type": "text"
                  },
                  "language": {
                    "type": "keyword"
                  },
                  "source": {
                    "type": "text"
                  },
                  "title": {
                    "type": "text"
                  },
                  "creator_tool": {
                    "type": "keyword"
                  },
                  "type": {
                    "type": "text"
                  },
                  "relation": {
                    "type": "text"
                  },
                  "contributor": {
                    "type": "text"
                  },
                  "rights": {
                    "type": "text"
                  },
                  "metadata_date": {
                    "format": "date_optional_time",
                    "type": "date"
                  },
                  "longitude": {
                    "type": "text"
                  },
                  "coverage": {
                    "type": "text"
                  },
                  "identifier": {
                    "type": "text"
                  },
                  "comments": {
                    "type": "text"
                  },
                  "author": {
                    "type": "text"
                  },
                  "created": {
                    "format": "date_optional_time",
                    "type": "date"
                  },
                  "format": {
                    "type": "text"
                  },
                  "publisher": {
                    "type": "text"
                  },
                  "print_date": {
                    "format": "date_optional_time",
                    "type": "date"
                  }
                }
              }
            }
          }
        }
      }
    },
    {
      "name": "fscrawler_settings_shards",
      "component_template": {
        "template": {
          "settings": {
            "index": {
              "number_of_shards": "1"
            }
          }
        }
      }
    },
    {
      "name": "fscrawler_mapping_content",
      "component_template": {
        "template": {
          "settings": {
            "index": {
              "analysis": {
                "filter": {
                  "my_synonym_filter": {
                    "type": "synonym",
                    "synonyms_path": "synonyms.txt",
                    "updateable": "true"
                  }
                },
                "analyzer": {
                  "my_synonym_analyzer": {
                    "filter": [
                      "lowercase",
                      "my_synonym_filter"
                    ],
                    "tokenizer": "standard"
                  }
                }
              }
            }
          }
        }
      }
    },
    {
      "name": "fscrawler_mapping_attachment",
      "component_template": {
        "template": {
          "mappings": {
            "properties": {
              "attachment": {
                "type": "binary",
                "doc_values": false
              }
            }
          }
        }
      }
    }
  ]
}

The file synonyms.txt is stored at ..\elasticsearch\config\synonyms.txt and contains this sample:

aws, amazon web service

But it has no effect.
When I search for aws I get:

EC2, **AWS** EFS, **AWS** FSx, **AWS** ALB, **AWS** NLB, **AWS** Systems Manager, **AWS** Secrets Manager, **AWS** Parameter Store….,

But to my understanding, searching for amazon web service should return at least something similar; instead, the results and scoring are completely different:

**web** services, and APIs using **Web** API. )….Proficient in **Web** Services, SOAP, XML, CSS, HTML5, AJAX and JavaScript.….Utilized contemporary design to create concise web sites for specific client needs.….Services (Facebook, Gmaps, Bing Maps, YouTube, Flickr, Panoramio, **Amazon** S3), etc.

Did I miss something, or what could be the reason?

Thanks, Andre

First, you need to test the my_synonym_analyzer analyzer to check how it behaves.

This can be done by running the _analyze API:

GET /fscrawler/_analyze
{
  "analyzer": "my_synonym_analyzer",
  "text": "AWS"
}
GET /fscrawler/_analyze
{
  "analyzer": "my_synonym_analyzer",
  "text": "Amazon Web Service"
}

What are the outputs?

But anyway, the component template is “wrong”:

    {
      "name": "fscrawler_mapping_content",
      "component_template": {
        "template": {
          "settings": {
            "index": {
              "analysis": {
                "filter": {
                  "my_synonym_filter": {
                    "type": "synonym",
                    "synonyms_path": "synonyms.txt",
                    "updateable": "true"
                  }
                },
                "analyzer": {
                  "my_synonym_analyzer": {
                    "filter": [
                      "lowercase",
                      "my_synonym_filter"
                    ],
                    "tokenizer": "standard"
                  }
                }
              }
            }
          }
        }
      }
    }

It should have this part which seems to be missing:

  "mappings": {
    "properties": {
      "content": {
        "type": "text",
        "analyzer": "my_synonym_analyzer"
      }
    }
  }

Check what the final mapping is with:

GET fscrawler/_mapping

Hi again,

the output looks good to my understanding:

{
  "tokens": [
    {
      "token": "aws",
      "start_offset": 0,
      "end_offset": 3,
      "type": "<ALPHANUM>",
      "position": 0
    },
    {
      "token": "amazon",
      "start_offset": 0,
      "end_offset": 3,
      "type": "SYNONYM",
      "position": 0
    },
    {
      "token": "web",
      "start_offset": 0,
      "end_offset": 3,
      "type": "SYNONYM",
      "position": 1
    },
    {
      "token": "service",
      "start_offset": 0,
      "end_offset": 3,
      "type": "SYNONYM",
      "position": 2
    }
  ]
}

{
  "tokens": [
    {
      "token": "amazon",
      "start_offset": 0,
      "end_offset": 6,
      "type": "<ALPHANUM>",
      "position": 0
    },
    {
      "token": "aws",
      "start_offset": 0,
      "end_offset": 18,
      "type": "SYNONYM",
      "position": 0,
      "positionLength": 3
    },
    {
      "token": "web",
      "start_offset": 7,
      "end_offset": 10,
      "type": "<ALPHANUM>",
      "position": 1
    },
    {
      "token": "service",
      "start_offset": 11,
      "end_offset": 18,
      "type": "<ALPHANUM>",
      "position": 2
    }
  ]
}

And these are the mappings:

{
  "resumes": {
    "mappings": {
      "dynamic_templates": [
        {
          "raw_as_text": {
            "path_match": "meta.raw.*",
            "mapping": {
              "fields": {
                "keyword": {
                  "ignore_above": 256,
                  "type": "keyword"
                }
              },
              "type": "text"
            }
          }
        }
      ],
      "properties": {
        "attachment": {
          "type": "binary"
        },
        "attributes": {
          "properties": {
            "group": {
              "type": "keyword"
            },
            "owner": {
              "type": "keyword"
            }
          }
        },
        "content": {
          "type": "text",
          "fields": {
            "keyword": {
              "type": "keyword",
              "ignore_above": 256
            }
          }
        },
        "file": {
          "properties": {
            "checksum": {
              "type": "keyword"
            },
            "content_type": {
              "type": "keyword"
            },
            "created": {
              "type": "date",
              "format": "date_optional_time"
            },
            "extension": {
              "type": "keyword"
            },
            "filename": {
              "type": "keyword",
              "store": true
            },
            "filesize": {
              "type": "long"
            },
            "indexed_chars": {
              "type": "long"
            },
            "indexing_date": {
              "type": "date",
              "format": "date_optional_time"
            },
            "last_accessed": {
              "type": "date",
              "format": "date_optional_time"
            },
            "last_modified": {
              "type": "date",
              "format": "date_optional_time"
            },
            "url": {
              "type": "keyword",
              "index": false
            }
          }
        },
        "meta": {
          "properties": {
            "altitude": {
              "type": "text"
            },
            "author": {
              "type": "text"
            },
            "comments": {
              "type": "text"
            },
            "contributor": {
              "type": "text"
            },
            "coverage": {
              "type": "text"
            },
            "created": {
              "type": "date",
              "format": "date_optional_time"
            },
            "creator_tool": {
              "type": "keyword"
            },
            "date": {
              "type": "date",
              "format": "date_optional_time"
            },
            "description": {
              "type": "text"
            },
            "format": {
              "type": "text"
            },
            "identifier": {
              "type": "text"
            },
            "keywords": {
              "type": "text"
            },
            "language": {
              "type": "keyword"
            },
            "latitude": {
              "type": "text"
            },
            "longitude": {
              "type": "text"
            },
            "metadata_date": {
              "type": "date",
              "format": "date_optional_time"
            },
            "modifier": {
              "type": "text"
            },
            "print_date": {
              "type": "date",
              "format": "date_optional_time"
            },
            "publisher": {
              "type": "text"
            },
            "rating": {
              "type": "byte"
            },
            "relation": {
              "type": "text"
            },
            "rights": {
              "type": "text"
            },
            "source": {
              "type": "text"
            },
            "title": {
              "type": "text"
            },
            "type": {
              "type": "text"
            }
          }
        },
        "path": {
          "properties": {
            "real": {
              "type": "keyword",
              "fields": {
                "fulltext": {
                  "type": "text"
                },
                "tree": {
                  "type": "text",
                  "analyzer": "fscrawler_path",
                  "fielddata": true
                }
              }
            },
            "root": {
              "type": "keyword"
            },
            "virtual": {
              "type": "keyword",
              "fields": {
                "fulltext": {
                  "type": "text"
                },
                "tree": {
                  "type": "text",
                  "analyzer": "fscrawler_path",
                  "fielddata": true
                }
              }
            }
          }
        }
      }
    }
  }
}

Of course I noticed the part with the mismatched bracket, but after fixing it the request returns no error.

Can I solve it in this way?

PUT /my_index/_mapping
{
  "properties": {
    "content": {
      "type": "text",
      "analyzer": "my_synonym_analyzer"
    }
  }
}

You need to delete the existing index and recreate it:

DELETE fscrawler
PUT fscrawler

If you still see the following when calling GET fscrawler/_mapping:

        "content": {
          "type": "text",
          "fields": {
            "keyword": {
              "type": "keyword",
              "ignore_above": 256
            }
          }
        },

Then something is wrong. And it won’t work with synonyms as the analyzer is not properly set.

I think this is the issue.
When I execute

PUT _component_template/myindex_mapping_content
{
  "template": {
    "settings": {
      "analysis": {
        "filter": {
          "my_synonym_filter": {
            "type": "synonym",
            "synonyms_set": "my-synonym-set",
            "updateable": true
          }
        },
        "analyzer": {
          "my_synonym_analyzer": {
            "tokenizer": "standard",
            "filter": [
              "lowercase",
              "my_synonym_filter"
            ]
          }
        }
    }
  },
  "mappings": {
    "properties": {
      "content": {
        "type": "text",
        "analyzer": "my_synonym_analyzer"
      }
      }
    }
  }
}

result is

{
  "error": {
    "root_cause": [
      {
        "type": "mapper_exception",
        "reason": "analyzer [my_synonym_analyzer] contains filters [my_synonym_filter] that are not allowed to run in index time mode."
      }
    ],
    "type": "mapper_parsing_exception",
    "reason": "Failed to parse mapping: analyzer [my_synonym_analyzer] contains filters [my_synonym_filter] that are not allowed to run in index time mode.",
    "caused_by": {
      "type": "mapper_exception",
      "reason": "analyzer [my_synonym_analyzer] contains filters [my_synonym_filter] that are not allowed to run in index time mode."
    }
  },
  "status": 400
}

I can exclude

"updateable": true

and result is

{
  "acknowledged": true
}

but when I execute GET myindex/_mapping I still see

        "content": {
          "type": "text",
          "fields": {
            "keyword": {
              "type": "keyword",
              "ignore_above": 256
            }
          }
        },

But did you run:

DELETE myindex
PUT myindex

That's it - I missed the PUT.
So everything is fine, thank you once again.

If you don't mind, I have some questions so that I understand it better.
What is the PUT for?
A PUT right after a DELETE looks a bit odd to me, as I would expect the index to be gone after a DELETE.

The same with PUT _component_template/myindex_mapping_content and the error result:

{
  "error": {
    "root_cause": [
      {
        "type": "mapper_exception",
        "reason": "analyzer [my_synonym_analyzer] contains filters [my_synonym_filter] that are not allowed to run in index time mode."
      }
    ],
    "type": "mapper_parsing_exception",
    "reason": "Failed to parse mapping: analyzer [my_synonym_analyzer] contains filters [my_synonym_filter] that are not allowed to run in index time mode.",
    "caused_by": {
      "type": "mapper_exception",
      "reason": "analyzer [my_synonym_analyzer] contains filters [my_synonym_filter] that are not allowed to run in index time mode."
    }
  },
  "status": 400
}

It runs fine now, even though there was an error previously.

Anyway I'm more than happy that it works now!

You don’t have to do this manually, as Elasticsearch will do it behind the scenes anyway when the first document is sent by FSCrawler.

PUT index creates the index.