Index Size, Shingle Analyzer

Hi Team,

I have WET (web page data) files in JSON format downloaded on one machine.
The WET files total 2.5 TB on that machine; when I ingest the data into Elasticsearch with the settings and mappings given below, the index grows to 25 TB, i.e. roughly 10x the original size.

The following are the index template settings and mappings:

Settings:


{
  "index": {
    "lifecycle": {
      "name": "webcrawl-fs-policy"
    },
    "mapping": {
      "nested_fields": {
        "limit": "500"
      },
      "depth": {
        "limit": "50"
      },
      "field_name_length": {
        "limit": "1000"
      },
      "total_fields": {
        "limit": "20000"
      }
    },
    "refresh_interval": "5s",
    "number_of_shards": "20",
    "max_docvalue_fields_search": "500",
    "max_shingle_diff": "7",
    "default_pipeline": "webcrawl-pipeline",
    "analysis": {
      "filter": {
        "my_shingle_filter": {
          "max_shingle_size": "7",
          "min_shingle_size": "2",
          "type": "shingle"
        }
      },
      "analyzer": {
        "my_shingle_analyzer": {
          "filter": [
            "lowercase",
            "my_shingle_filter",
            "trim",
            "stop"
          ],
          "tokenizer": "whitespace"
        }
      }
    },
    "number_of_replicas": "0"
  }
}

Mappings:


{
  "_routing": {
    "required": false
  },
  "numeric_detection": false,
  "dynamic_date_formats": [
    "strict_date_optional_time",
    "yyyy/MM/dd HH:mm:ss Z||yyyy/MM/dd Z"
  ],
  "_source": {
    "excludes": [],
    "includes": [],
    "enabled": true
  },
  "dynamic": true,
  "dynamic_templates": [],
  "date_detection": true,
  "properties": {
    "webdate": {
      "type": "date"
    },
    "webtitlepage": {
      "eager_global_ordinals": false,
      "index_phrases": false,
      "search_quote_analyzer": "my_shingle_analyzer",
      "fielddata": false,
      "norms": true,
      "analyzer": "my_shingle_analyzer",
      "index": true,
      "store": false,
      "type": "text",
      "index_options": "positions"
    },
    "Envelope.Payload-Metadata.HTTP-Response-Metadata.Headers": {
      "type": "flattened"
    },
    "html": {
      "eager_global_ordinals": false,
      "index_phrases": false,
      "search_quote_analyzer": "my_shingle_analyzer",
      "fielddata": false,
      "norms": true,
      "analyzer": "my_shingle_analyzer",
      "index": true,
      "store": false,
      "type": "text",
      "index_options": "positions"
    },
    "message": {
      "eager_global_ordinals": false,
      "index_phrases": false,
      "search_quote_analyzer": "my_shingle_analyzer",
      "fielddata": false,
      "norms": true,
      "analyzer": "my_shingle_analyzer",
      "index": true,
      "store": false,
      "type": "text",
      "index_options": "positions"
    },
    "content": {
      "eager_global_ordinals": false,
      "index_phrases": false,
      "search_quote_analyzer": "my_shingle_analyzer",
      "fielddata": false,
      "norms": true,
      "analyzer": "my_shingle_analyzer",
      "index": true,
      "store": false,
      "type": "text",
      "index_options": "positions"
    },
    "url": {
      "eager_global_ordinals": false,
      "index_phrases": false,
      "search_quote_analyzer": "my_shingle_analyzer",
      "fielddata": false,
      "norms": true,
      "analyzer": "my_shingle_analyzer",
      "index": true,
      "store": false,
      "type": "text",
      "index_options": "positions"
    }
  }
}

My questions are:

  1. I am applying a custom shingle analyzer for typo correction and popular-suggestion search functionality. Could setting the max shingle size to 7 be what is causing my index to grow so large?
"analysis": {
      "filter": {
        "my_shingle_filter": {
          "max_shingle_size": "7",
          "min_shingle_size": "2",
          "type": "shingle"
        }
      },
      "analyzer": {
        "my_shingle_analyzer": {
          "filter": [
            "lowercase",
            "my_shingle_filter",
            "trim",
            "stop"
          ],
          "tokenizer": "whitespace"
        }
  1. Let's say this setting is causing the index size problem, since it stores more tokens. I can reduce the values to min 2 / max 3, the same as shown here: Shingle token filter | Elasticsearch Guide [8.13] | Elastic.
    Will that reduce the index size considerably?

Additional points:
i. I am not using any replicas.
ii. I am using 20 primary shards to improve searchability.
iii. I have 150 Elasticsearch nodes, each machine with 62 GB RAM and 8 CPU cores.

  1. For a production use case, what is the recommended max shingle size?
    Let's say I have 1 PB of data and a user searches for any term.
    Example: if the user types "Britney Spear" instead of "Britney Spears", the correction should point to "Britney Spears". "Spear" is not itself a typo, but "Britney Spears" is the most popular phrase.

Alternatively, please suggest anything I need to add to the mappings in order to achieve this use case.

The important fields I have are:

  1. url -- URL of the web page
  2. content -- web page content
  3. webdate -- last-modified date of the web page

I applied the custom shingle analyzer to the content field because content holds the most useful data.

Thanks in advance