Stopwords (Elasticsearch 7.14 + FSCrawler 2.7)

How do I configure stopwords? I am using FSCrawler 2.7 and Elasticsearch 7.14.

Welcome!

You need to define your own analyzer as explained in https://fscrawler.readthedocs.io/en/fscrawler-2.7/admin/fs/elasticsearch.html#creating-your-own-mapping-analyzers

I'd recommend doing it manually with (quote from docs):

Note that if you want to push manually the mapping to elasticsearch you can use the classic REST calls:

# Create index (don't forget to add the fscrawler_path analyzer)
PUT docs
{
  // Same index settings as previously seen
}

You can configure the stopwords of the language analyzer you want to use. See the "Language analyzers" page of the Elasticsearch Guide [8.11].

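Or for example (note: the "_doc" level under "mappings" in this request is only accepted by Elasticsearch 6.x, or by 7.x when "include_type_name=true" is passed; on Elasticsearch 7.x use the corrected request given later in this thread):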

PUT /docs
{
  "settings": {
    "number_of_shards": 1,
    "index.mapping.total_fields.limit": 2000,
    "analysis": {
      "tokenizer": {
        "fscrawler_path": {
          "type": "path_hierarchy"
        }
      },
      "filter": {
        "english_stop": {
          "type":       "stop",
          "stopwords":  ["_english_", "fscrawler" ]
        },
        "english_keywords": {
          "type":       "keyword_marker",
          "keywords":   ["example"] 
        },
        "english_stemmer": {
          "type":       "stemmer",
          "language":   "english"
        },
        "english_possessive_stemmer": {
          "type":       "stemmer",
          "language":   "possessive_english"
        }
      },
      "analyzer": {
        "fscrawler_path": {
          "tokenizer": "fscrawler_path"
        },
        "rebuilt_english": {
          "tokenizer":  "standard",
          "filter": [
            "english_possessive_stemmer",
            "lowercase",
            "english_stop",
            "english_keywords",
            "english_stemmer"
          ]
        }
      }
    }
  },
  "mappings": {
    "_doc": {
      "dynamic_templates": [
        {
          "raw_as_text": {
            "path_match": "meta.raw.*",
            "mapping": {
              "type": "text",
              "fields": {
                "keyword": {
                  "type": "keyword",
                  "ignore_above": 256
                }
              }
            }
          }
        }
      ],
      "properties": {
        "attachment": {
          "type": "binary",
          "doc_values": false
        },
        "attributes": {
          "properties": {
            "group": {
              "type": "keyword"
            },
            "owner": {
              "type": "keyword"
            }
          }
        },
        "content": {
          "type": "text",
          "analyzer": "rebuilt_english"
        },
        "file": {
          "properties": {
            "content_type": {
              "type": "keyword"
            },
            "filename": {
              "type": "keyword",
              "store": true
            },
            "extension": {
              "type": "keyword"
            },
            "filesize": {
              "type": "long"
            },
            "indexed_chars": {
              "type": "long"
            },
            "indexing_date": {
              "type": "date",
              "format": "dateOptionalTime"
            },
            "created": {
              "type": "date",
              "format": "dateOptionalTime"
            },
            "last_modified": {
              "type": "date",
              "format": "dateOptionalTime"
            },
            "last_accessed": {
              "type": "date",
              "format": "dateOptionalTime"
            },
            "checksum": {
              "type": "keyword"
            },
            "url": {
              "type": "keyword",
              "index": false
            }
          }
        },
        "meta": {
          "properties": {
            "author": {
              "type": "text"
            },
            "date": {
              "type": "date",
              "format": "dateOptionalTime"
            },
            "keywords": {
              "type": "text"
            },
            "title": {
              "type": "text"
            },
            "language": {
              "type": "keyword"
            },
            "format": {
              "type": "text"
            },
            "identifier": {
              "type": "text"
            },
            "contributor": {
              "type": "text"
            },
            "coverage": {
              "type": "text"
            },
            "modifier": {
              "type": "text"
            },
            "creator_tool": {
              "type": "keyword"
            },
            "publisher": {
              "type": "text"
            },
            "relation": {
              "type": "text"
            },
            "rights": {
              "type": "text"
            },
            "source": {
              "type": "text"
            },
            "type": {
              "type": "text"
            },
            "description": {
              "type": "text"
            },
            "created": {
              "type": "date",
              "format": "dateOptionalTime"
            },
            "print_date": {
              "type": "date",
              "format": "dateOptionalTime"
            },
            "metadata_date": {
              "type": "date",
              "format": "dateOptionalTime"
            },
            "latitude": {
              "type": "text"
            },
            "longitude": {
              "type": "text"
            },
            "altitude": {
              "type": "text"
            },
            "rating": {
              "type": "byte"
            },
            "comments": {
              "type": "text"
            }
          }
        },
        "path": {
          "properties": {
            "real": {
              "type": "keyword",
              "fields": {
                "tree": {
                  "type": "text",
                  "analyzer": "fscrawler_path",
                  "fielddata": true
                },
                "fulltext": {
                  "type": "text"
                }
              }
            },
            "root": {
              "type": "keyword"
            },
            "virtual": {
              "type": "keyword",
              "fields": {
                "tree": {
                  "type": "text",
                  "analyzer": "fscrawler_path",
                  "fielddata": true
                },
                "fulltext": {
                  "type": "text"
                }
              }
            }
          }
        }
      }
    }
  }
}

Thank you

Hello,
I'm a beginner with Elasticsearch and FSCrawler. I can't configure stopwords; I use Elasticsearch 7.14 and FSCrawler 2.7. Can you help me, please?

I think that's what I did already.

What is the problem?

The problem is this error when I run the request:
{
"error" : {
"root_cause" : [
{
"type" : "illegal_argument_exception",
"reason" : "The mapping definition cannot be nested under a type [_doc] unless include_type_name is set to true."
}
],
"type" : "illegal_argument_exception",
"reason" : "The mapping definition cannot be nested under a type [_doc] unless include_type_name is set to true."
},
"status" : 400
}

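Elasticsearch 7.x no longer accepts a type name such as "_doc" under "mappings" unless "include_type_name=true" is passed. Could you try this instead, with the "_doc" level removed: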

# Elasticsearch 7.x: mappings are typeless, so "dynamic_templates" and "properties" sit directly under "mappings" (no "_doc" level).
PUT /docs
{
  "settings": {
    "number_of_shards": 1,
    "index.mapping.total_fields.limit": 2000,
    "analysis": {
      "tokenizer": {
        "fscrawler_path": {
          "type": "path_hierarchy"
        }
      },
      "filter": {
        "english_stop": {
          "type": "stop",
          "stopwords": ["_english_", "fscrawler"]
        },
        "english_keywords": {
          "type": "keyword_marker",
          "keywords": ["example"]
        },
        "english_stemmer": {
          "type": "stemmer",
          "language": "english"
        },
        "english_possessive_stemmer": {
          "type": "stemmer",
          "language": "possessive_english"
        }
      },
      "analyzer": {
        "fscrawler_path": {
          "tokenizer": "fscrawler_path"
        },
        "rebuilt_english": {
          "tokenizer": "standard",
          "filter": [
            "english_possessive_stemmer",
            "lowercase",
            "english_stop",
            "english_keywords",
            "english_stemmer"
          ]
        }
      }
    }
  },
  "mappings": {
    "dynamic_templates": [
      {
        "raw_as_text": {
          "path_match": "meta.raw.*",
          "mapping": {
            "type": "text",
            "fields": {
              "keyword": {
                "type": "keyword",
                "ignore_above": 256
              }
            }
          }
        }
      }
    ],
    "properties": {
      "attachment": {
        "type": "binary",
        "doc_values": false
      },
      "attributes": {
        "properties": {
          "group": {
            "type": "keyword"
          },
          "owner": {
            "type": "keyword"
          }
        }
      },
      "content": {
        "type": "text",
        "analyzer": "rebuilt_english"
      },
      "file": {
        "properties": {
          "content_type": {
            "type": "keyword"
          },
          "filename": {
            "type": "keyword",
            "store": true
          },
          "extension": {
            "type": "keyword"
          },
          "filesize": {
            "type": "long"
          },
          "indexed_chars": {
            "type": "long"
          },
          "indexing_date": {
            "type": "date",
            "format": "date_optional_time"
          },
          "created": {
            "type": "date",
            "format": "date_optional_time"
          },
          "last_modified": {
            "type": "date",
            "format": "date_optional_time"
          },
          "last_accessed": {
            "type": "date",
            "format": "date_optional_time"
          },
          "checksum": {
            "type": "keyword"
          },
          "url": {
            "type": "keyword",
            "index": false
          }
        }
      },
      "meta": {
        "properties": {
          "author": {
            "type": "text"
          },
          "date": {
            "type": "date",
            "format": "date_optional_time"
          },
          "keywords": {
            "type": "text"
          },
          "title": {
            "type": "text"
          },
          "language": {
            "type": "keyword"
          },
          "format": {
            "type": "text"
          },
          "identifier": {
            "type": "text"
          },
          "contributor": {
            "type": "text"
          },
          "coverage": {
            "type": "text"
          },
          "modifier": {
            "type": "text"
          },
          "creator_tool": {
            "type": "keyword"
          },
          "publisher": {
            "type": "text"
          },
          "relation": {
            "type": "text"
          },
          "rights": {
            "type": "text"
          },
          "source": {
            "type": "text"
          },
          "type": {
            "type": "text"
          },
          "description": {
            "type": "text"
          },
          "created": {
            "type": "date",
            "format": "date_optional_time"
          },
          "print_date": {
            "type": "date",
            "format": "date_optional_time"
          },
          "metadata_date": {
            "type": "date",
            "format": "date_optional_time"
          },
          "latitude": {
            "type": "text"
          },
          "longitude": {
            "type": "text"
          },
          "altitude": {
            "type": "text"
          },
          "rating": {
            "type": "byte"
          },
          "comments": {
            "type": "text"
          }
        }
      },
      "path": {
        "properties": {
          "real": {
            "type": "keyword",
            "fields": {
              "tree": {
                "type": "text",
                "analyzer": "fscrawler_path",
                "fielddata": true
              },
              "fulltext": {
                "type": "text"
              }
            }
          },
          "root": {
            "type": "keyword"
          },
          "virtual": {
            "type": "keyword",
            "fields": {
              "tree": {
                "type": "text",
                "analyzer": "fscrawler_path",
                "fielddata": true
              },
              "fulltext": {
                "type": "text"
              }
            }
          }
        }
      }
    }
  }
}

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.