How can I correctly index @screen_name, #hashtag and url in Japanese text?

What I need to do

  • Use kuromoji as a tokenizer and an analyzer for handling Japanese fulltext search
  • Recognize tokens that match /\b@\w{1,30}\b/ (screen_name)
  • Recognize tokens that match a URL pattern
  • Recognize tokens that match /\b[#＃][^\p{Zs}]{1,50}(?=\p{Zs}|$)/ (hashtag)
  • Highlight matches

How can I achieve all of them at once?

Current index definition

Roughly like this:

{
  "app": {
    "mappings": {
      "_doc": {
        "dynamic": "false",
        "_source": {
          "enabled": false
        },
        "properties": {
          "type": {
            "type": "keyword"
          },
          "text": {
            "type": "text",
            "store": true,
            "analyzer": "kuromoji_analyzer"
          }
        }
      }
    },
    "settings": {
      "index": {
        "number_of_shards": "1",
        "provided_name": "app",
        "creation_date": "1536550044056",
        "analysis": {
          "analyzer": {
            "kuromoji_analyzer": {
              "filter": [
                "kuromoji_baseform",
                "kuromoji_part_of_speech",
                "cjk_width",
                "stop",
                "ja_stop",
                "kuromoji_stemmer",
                "lowercase"
              ],
              "type": "custom",
              "tokenizer": "kuromoji_tokenizer_search"
            }
          },
          "tokenizer": {
            "kuromoji_tokenizer_search": {
              "mode": "search",
              "type": "kuromoji_tokenizer",
              "discard_punctuation": "true"
            }
          }
        },
        "number_of_replicas": "0",
        "uuid": "AYdPqQSZTjCBJkopeG1_5Q",
        "version": {
          "created": "6030299"
        }
      }
    }
  }
}

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.