How to use FSCrawler to send a Base64-encoded PDF to Elasticsearch?

I'm trying to index a PDF into Elasticsearch using FSCrawler, since Elasticsearch needs Base64-encoded data. I have the following job created in FSCrawler:

{
  "name" : "my_job_name",
  "fs" : {
    "url" : "/tmp/es",
    "update_rate" : "15m",
    "includes" : [ "*.pdf" ],
    "json_support" : false,
    "filename_as_id" : false,
    "add_filesize" : true,
    "remove_deleted" : true,
    "add_as_inner_object" : false,
    "store_source" : false,
    "index_content" : true,
    "attributes_support" : false,
    "raw_metadata" : false,
    "xml_support" : false,
    "index_folders" : true,
    "lang_detect" : false,
    "continue_on_error" : false,
    "pdf_ocr" : true,
    "ocr" : {
      "language" : "eng"
    }
  },
  "elasticsearch" : {
    "nodes" : [ {
      "host" : "127.0.0.1",
      "port" : 9200,
      "scheme" : "HTTP"
    } ],
    "bulk_size" : 100,
    "flush_interval" : "5s",
    "pipeline" : "my_pipeline_id"
  },
  "rest" : {
    "scheme" : "HTTP",
    "host" : "127.0.0.1",
    "port" : 8080,
    "endpoint" : "fscrawler"
  }
}

("my_pipeline_id" is a placeholder, for now. I don't have a pipeline created.)

How would I create a pipeline so that FSCrawler sends the Base64-encoded PDF to Elasticsearch?

When I run FSCrawler on one PDF, this is what I get in the ES index:

{
  "my_job_name" : {
    "aliases" : { },
    "mappings" : {
      "doc" : {
        "properties" : {
          "attachment" : {
            "type" : "binary"
          },
          "attributes" : {
            "properties" : {
              "group" : {
                "type" : "keyword"
              },
              "owner" : {
                "type" : "keyword"
              }
            }
          },
          "content" : {
            "type" : "text"
          },
          "file" : {
            "properties" : {
              "checksum" : {
                "type" : "keyword"
              },
              "content_type" : {
                "type" : "keyword"
              },
              "extension" : {
                "type" : "keyword"
              },
              "filename" : {
                "type" : "keyword"
              },
              "filesize" : {
                "type" : "long"
              },
              "indexed_chars" : {
                "type" : "long"
              },
              "indexing_date" : {
                "type" : "date",
                "format" : "dateOptionalTime"
              },
              "last_modified" : {
                "type" : "date",
                "format" : "dateOptionalTime"
              },
              "url" : {
                "type" : "keyword",
                "index" : false
              }
            }
          },
          "meta" : {
            "properties" : {
              "altitude" : {
                "type" : "text"
              },
              "author" : {
                "type" : "text"
              },
              "comments" : {
                "type" : "text"
              },
              "contributor" : {
                "type" : "text"
              },
              "coverage" : {
                "type" : "text"
              },
              "created" : {
                "type" : "date",
                "format" : "dateOptionalTime"
              },
              "creator_tool" : {
                "type" : "keyword"
              },
              "date" : {
                "type" : "date",
                "format" : "dateOptionalTime"
              },
              "description" : {
                "type" : "text"
              },
              "format" : {
                "type" : "text"
              },
              "identifier" : {
                "type" : "text"
              },
              "keywords" : {
                "type" : "text"
              },
              "language" : {
                "type" : "keyword"
              },
              "latitude" : {
                "type" : "text"
              },
              "longitude" : {
                "type" : "text"
              },
              "metadata_date" : {
                "type" : "date",
                "format" : "dateOptionalTime"
              },
              "modifier" : {
                "type" : "text"
              },
              "print_date" : {
                "type" : "date",
                "format" : "dateOptionalTime"
              },
              "publisher" : {
                "type" : "text"
              },
              "rating" : {
                "type" : "byte"
              },
              "relation" : {
                "type" : "text"
              },
              "rights" : {
                "type" : "text"
              },
              "source" : {
                "type" : "text"
              },
              "title" : {
                "type" : "text"
              },
              "type" : {
                "type" : "text"
              }
            }
          },
          "path" : {
            "properties" : {
              "real" : {
                "type" : "keyword",
                "fields" : {
                  "tree" : {
                    "type" : "text",
                    "analyzer" : "fscrawler_path",
                    "fielddata" : true
                  }
                }
              },
              "root" : {
                "type" : "keyword"
              },
              "virtual" : {
                "type" : "keyword",
                "fields" : {
                  "tree" : {
                    "type" : "text",
                    "analyzer" : "fscrawler_path",
                    "fielddata" : true
                  }
                }
              }
            }
          }
        }
      }
    }
  }
}

What do the FSCrawler logs say?

("my_pipeline_id" is a placeholder, for now. I don't have a pipeline created.)

This will fail at index time if the pipeline does not exist.
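If the pipeline does not exist yet, you can create an empty one up front so the bulk requests succeed. A minimal sketch with the JavaScript client, assuming the id matches "elasticsearch.pipeline" in the job settings:

client.ingest.putPipeline({
  id: 'my_pipeline_id',
  body: {
    // an empty processors list is valid: documents pass through unchanged
    "description" : "placeholder until the real pipeline is ready",
    "processors" : []
  }
})
.then(function () {
  console.log("placeholder pipeline created");
})
.catch(function (error) {
  console.log("putPipeline error: " + error);
});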

this is what I get in the ES index:

This is the result of a

GET my_job_name

But to know if documents have been indexed, run:

GET my_job_name/_search
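Roughly the same check via the JavaScript client, assuming a configured client (the exact response shape depends on the client version):

client.search({ index: "my_job_name" })
.then(function (resp) {
  // hits.total tells you how many documents were indexed
  console.log(resp.hits.total);
})
.catch(function (error) {
  console.log("search error: " + error);
});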

Thank you for the response. I figured it out; responding to help other readers.

I wasn't configuring my settings properly in the _settings.json inside the "job_name" folder that is created after starting up FSCrawler with:

$ bin/fscrawler --config_dir ./test job_name

I had to set the "fs.url" to the absolute path where my PDFs were stored in my local filesystem.

{
  "name" : "job_name",
  "fs" : {
    "url" : "/absolute/path/to/PDFs",
    ...
  },
  ...
}

I started up Elasticsearch:

bin/elasticsearch

PUT an ingest pipeline (using the JavaScript client):

client.ingest.putPipeline({
   id: "my-pipeline-id",
   ...
})

reconfigured my _settings.json for job_name to point at the pipeline:

{
  "name" : "job_name",
  "fs" : {
    "url" : "/absolute/path/to/PDFs",
    ...
  },
  "elasticsearch" : {
    "pipeline" : "my-pipeline-id",
    ...
  },
  ...
}

and restarted fscrawler

$ bin/fscrawler --config_dir ./test job_name

Result: running

GET job_name/_search

returned all documents from my job_name index.
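To confirm the pipeline actually ran (and not just the crawl), limiting _source to the fields the pipeline sets also works. A rough sketch, with field names taken from the pipeline shown in the next reply:

client.search({
  index: "job_name",
  body: {
    // only return the fields the pipeline adds, to verify the processors ran
    "_source" : ["published", "difficulty", "topics"],
    "query" : { "match_all" : {} }
  }
})
.then(function (resp) {
  console.log(JSON.stringify(resp.hits.hits, null, 2));
})
.catch(function (error) {
  console.log("search error: " + error);
});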

What is your pipeline doing?

Not much at the moment; it just removes some fields, formats others, and sets new ones.

client.ingest.putPipeline({
  id: 'my-pipeline-id',
  body: {
    "description" : "parse pdfs and index into ES",
    "processors" : [
      {
        // drop the path and file metadata objects entirely
        "remove" : {
          "field" : ["path", "file"],
          "tag" : "remove_fields"
        }
      },
      {
        // turn the pipe-separated author string into an array
        "split" : {
          "field" : "_source.meta.author",
          "separator" : "\\|",
          "tag" : "split_author"
        }
      },
      {
        // strip leading/trailing whitespace from each author
        "foreach" : {
          "field" : "_source.meta.author",
          "processor" : {
            "trim" : {
              "field" : "_ingest._value"
            }
          },
          "tag" : "trim_author"
        }
      },
      {
        // break the created date into [year, month, day]
        "split" : {
          "field" : "_source.meta.created",
          "separator" : "\\-",
          "tag" : "split_created"
        }
      },
      {
        // default values for fields the PDFs don't carry
        "set" : { "field" : "difficulty", "value" : 3, "tag" : "set_difficulty" }
      },
      {
        "set" : { "field" : "topics", "value" : "Unknown", "tag" : "set_topics" }
      },
      {
        // publication year is the first element of the split date
        "script": {
          "lang": "painless",
          "source": "ctx.published = ctx.meta.created[0]",
          "tag" : "set_published_date"
        }
      },
      {
        // course name is the first 32 characters of the title
        "script": {
          "lang": "painless",
          "source": "ctx.course = ctx.meta.title.substring(0, 32)",
          "tag" : "course"
        }
      }
    ]
  }
})
.then(function () {
  console.log("putPipeline Resolved");
})
.catch(function (error) {
  console.log("putPipeline error: " + error);
});
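As a side note, a pipeline like this can be dry-run against a sample document with the _simulate API before re-running FSCrawler. A sketch with the same client; the sample field values below are made up, but the shape matches what FSCrawler indexes:

client.ingest.simulate({
  id: 'my-pipeline-id',
  body: {
    "docs" : [
      {
        "_source" : {
          // made-up sample values shaped like an FSCrawler document
          "meta" : {
            "author" : "Jane Doe | John Roe",
            "created" : "2018-01-15",
            "title" : "An Example Title Long Enough To Substring Safely"
          },
          "path" : { "real" : "/tmp/es/sample.pdf" },
          "file" : { "filename" : "sample.pdf" }
        }
      }
    ]
  }
})
.then(function (resp) {
  // each entry shows the document as it would look after the pipeline
  console.log(JSON.stringify(resp.docs, null, 2));
})
.catch(function (error) {
  console.log("simulate error: " + error);
});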
