I am trying to index a PDF into Elasticsearch using FSCrawler, since Elasticsearch needs Base64-encoded data. I have created the following job in FSCrawler:
{
"name" : "my_job_name",
"fs" : {
"url" : "/tmp/es",
"update_rate" : "15m",
"includes" : [ "*.pdf" ],
"json_support" : false,
"filename_as_id" : false,
"add_filesize" : true,
"remove_deleted" : true,
"add_as_inner_object" : false,
"store_source" : false,
"index_content" : true,
"attributes_support" : false,
"raw_metadata" : false,
"xml_support" : false,
"index_folders" : true,
"lang_detect" : false,
"continue_on_error" : false,
"pdf_ocr" : true,
"ocr" : {
"language" : "eng"
}
},
"elasticsearch" : {
"nodes" : [ {
"host" : "127.0.0.1",
"port" : 9200,
"scheme" : "HTTP"
} ],
"bulk_size" : 100,
"flush_interval" : "5s",
"pipeline" : "my_pipeline_id"
},
"rest" : {
"scheme" : "HTTP",
"host" : "127.0.0.1",
"port" : 8080,
"endpoint" : "fscrawler"
}
}
("my_pipeline_id" is a placeholder for now; I have not created a pipeline yet.)
How would I create a pipeline so that FSCrawler sends the Base64-encoded PDF to Elasticsearch?
When I run FSCrawler on one PDF, this is what I get in the Elasticsearch index:
{
"my_job_name" : {
"aliases" : { },
"mappings" : {
"doc" : {
"properties" : {
"attachment" : {
"type" : "binary"
},
"attributes" : {
"properties" : {
"group" : {
"type" : "keyword"
},
"owner" : {
"type" : "keyword"
}
}
},
"content" : {
"type" : "text"
},
"file" : {
"properties" : {
"checksum" : {
"type" : "keyword"
},
"content_type" : {
"type" : "keyword"
},
"extension" : {
"type" : "keyword"
},
"filename" : {
"type" : "keyword"
},
"filesize" : {
"type" : "long"
},
"indexed_chars" : {
"type" : "long"
},
"indexing_date" : {
"type" : "date",
"format" : "dateOptionalTime"
},
"last_modified" : {
"type" : "date",
"format" : "dateOptionalTime"
},
"url" : {
"type" : "keyword",
"index" : false
}
}
},
"meta" : {
"properties" : {
"altitude" : {
"type" : "text"
},
"author" : {
"type" : "text"
},
"comments" : {
"type" : "text"
},
"contributor" : {
"type" : "text"
},
"coverage" : {
"type" : "text"
},
"created" : {
"type" : "date",
"format" : "dateOptionalTime"
},
"creator_tool" : {
"type" : "keyword"
},
"date" : {
"type" : "date",
"format" : "dateOptionalTime"
},
"description" : {
"type" : "text"
},
"format" : {
"type" : "text"
},
"identifier" : {
"type" : "text"
},
"keywords" : {
"type" : "text"
},
"language" : {
"type" : "keyword"
},
"latitude" : {
"type" : "text"
},
"longitude" : {
"type" : "text"
},
"metadata_date" : {
"type" : "date",
"format" : "dateOptionalTime"
},
"modifier" : {
"type" : "text"
},
"print_date" : {
"type" : "date",
"format" : "dateOptionalTime"
},
"publisher" : {
"type" : "text"
},
"rating" : {
"type" : "byte"
},
"relation" : {
"type" : "text"
},
"rights" : {
"type" : "text"
},
"source" : {
"type" : "text"
},
"title" : {
"type" : "text"
},
"type" : {
"type" : "text"
}
}
},
"path" : {
"properties" : {
"real" : {
"type" : "keyword",
"fields" : {
"tree" : {
"type" : "text",
"analyzer" : "fscrawler_path",
"fielddata" : true
}
}
},
"root" : {
"type" : "keyword"
},
"virtual" : {
"type" : "keyword",
"fields" : {
"tree" : {
"type" : "text",
"analyzer" : "fscrawler_path",
"fielddata" : true
}
}
}
}
}
}
}
}
}
}