How to use FSCrawler to send elasticsearch Base64 encoded PDF?

aeronesto · March 16, 2018, 7:06pm

I am trying to index a PDF into Elasticsearch using FSCrawler, since Elasticsearch needs Base64-encoded data. I have created the following job in FSCrawler:

{
  "name" : "my_job_name",
  "fs" : {
    "url" : "/tmp/es",
    "update_rate" : "15m",
    "includes" : [ "*.pdf" ],
    "json_support" : false,
    "filename_as_id" : false,
    "add_filesize" : true,
    "remove_deleted" : true,
    "add_as_inner_object" : false,
    "store_source" : false,
    "index_content" : true,
    "attributes_support" : false,
    "raw_metadata" : false,
    "xml_support" : false,
    "index_folders" : true,
    "lang_detect" : false,
    "continue_on_error" : false,
    "pdf_ocr" : true,
    "ocr" : {
      "language" : "eng"
    }
  },
  "elasticsearch" : {
    "nodes" : [ {
      "host" : "127.0.0.1",
      "port" : 9200,
      "scheme" : "HTTP"
    } ],
    "bulk_size" : 100,
    "flush_interval" : "5s",
    "pipeline" : "my_pipeline_id"
  },
  "rest" : {
    "scheme" : "HTTP",
    "host" : "127.0.0.1",
    "port" : 8080,
    "endpoint" : "fscrawler"
  }
}

("my_pipeline_id" is a placeholder, for now. I don't have a pipeline created.)

How would I create my pipeline so that FSCrawler sends the Base64-encoded PDF to Elasticsearch?

When I run FSCrawler on one PDF, this is what I get in the ES index:

{
  "my_job_name" : {
    "aliases" : { },
    "mappings" : {
      "doc" : {
        "properties" : {
          "attachment" : {
            "type" : "binary"
          },
          "attributes" : {
            "properties" : {
              "group" : {
                "type" : "keyword"
              },
              "owner" : {
                "type" : "keyword"
              }
            }
          },
          "content" : {
            "type" : "text"
          },
          "file" : {
            "properties" : {
              "checksum" : {
                "type" : "keyword"
              },
              "content_type" : {
                "type" : "keyword"
              },
              "extension" : {
                "type" : "keyword"
              },
              "filename" : {
                "type" : "keyword"
              },
              "filesize" : {
                "type" : "long"
              },
              "indexed_chars" : {
                "type" : "long"
              },
              "indexing_date" : {
                "type" : "date",
                "format" : "dateOptionalTime"
              },
              "last_modified" : {
                "type" : "date",
                "format" : "dateOptionalTime"
              },
              "url" : {
                "type" : "keyword",
                "index" : false
              }
            }
          },
          "meta" : {
            "properties" : {
              "altitude" : {
                "type" : "text"
              },
              "author" : {
                "type" : "text"
              },
              "comments" : {
                "type" : "text"
              },
              "contributor" : {
                "type" : "text"
              },
              "coverage" : {
                "type" : "text"
              },
              "created" : {
                "type" : "date",
                "format" : "dateOptionalTime"
              },
              "creator_tool" : {
                "type" : "keyword"
              },
              "date" : {
                "type" : "date",
                "format" : "dateOptionalTime"
              },
              "description" : {
                "type" : "text"
              },
              "format" : {
                "type" : "text"
              },
              "identifier" : {
                "type" : "text"
              },
              "keywords" : {
                "type" : "text"
              },
              "language" : {
                "type" : "keyword"
              },
              "latitude" : {
                "type" : "text"
              },
              "longitude" : {
                "type" : "text"
              },
              "metadata_date" : {
                "type" : "date",
                "format" : "dateOptionalTime"
              },
              "modifier" : {
                "type" : "text"
              },
              "print_date" : {
                "type" : "date",
                "format" : "dateOptionalTime"
              },
              "publisher" : {
                "type" : "text"
              },
              "rating" : {
                "type" : "byte"
              },
              "relation" : {
                "type" : "text"
              },
              "rights" : {
                "type" : "text"
              },
              "source" : {
                "type" : "text"
              },
              "title" : {
                "type" : "text"
              },
              "type" : {
                "type" : "text"
              }
            }
          },
          "path" : {
            "properties" : {
              "real" : {
                "type" : "keyword",
                "fields" : {
                  "tree" : {
                    "type" : "text",
                    "analyzer" : "fscrawler_path",
                    "fielddata" : true
                  }
                }
              },
              "root" : {
                "type" : "keyword"
              },
              "virtual" : {
                "type" : "keyword",
                "fields" : {
                  "tree" : {
                    "type" : "text",
                    "analyzer" : "fscrawler_path",
                    "fielddata" : true
                  }
                }
              }
            }
          }
        }
      }
    }
  }
}

dadoonet · April 13, 2018, 1:45pm

What are the logs of fscrawler?

("my_pipeline_id" is a placeholder, for now. I don't have a pipeline created.)

This will fail at index time if the pipeline does not exist.

this is what I get in the ES index:

This is the result of a

GET my_job_name

But to know if documents have been indexed, run:

GET my_job_name/_search

aeronesto · April 17, 2018, 7:30pm

Thank you for the response. I figured it out; responding to help other readers.

I wasn't configuring my settings properly in the "job_name" folder created after starting up fscrawler with:

$ bin/fscrawler --config_dir ./test job_name

I had to set the "fs.url" to the absolute path where my PDFs were stored in my local filesystem.

{
  "name" : "job_name",
  "fs" : {
    "url" : "absolute/path/to/PDFs",
     ...
  }
  ...
}

I started up my Elasticsearch node:

bin/elasticsearch

PUT an ingest pipeline into Elasticsearch (using the JavaScript client):

client.ingest.putPipeline({
   id: "my-pipeline-id",
   ...
})

reconfigured my _settings.json for job_name

{
  "name" : "job_name",
  "fs" : {
    "url" : "absolute/path/to/PDFs",
     ...
  },
  "elasticsearch" : {
    "pipeline" : "my-pipeline-id",
     ...
  }
  ...
}

and restarted fscrawler

$ bin/fscrawler --config_dir ./test job_name

Result: running

GET job_name/_search

returned all documents from my job_name index.

dadoonet · April 17, 2018, 7:53pm

What is your pipeline doing?

aeronesto · April 19, 2018, 1:30am

Not much, at the moment; just remove some fields, format others, and set new ones.

// Registers the ingest pipeline 'pipeline-id'. Its processors run in the
// order listed: drop unwanted fields, reshape author/created metadata,
// then set a few derived fields on each document.
client.ingest.putPipeline({
  id: 'pipeline-id',
  body: {
    "description" : "parse pdfs and index into ES",
    "processors" : [
      {
        // Drop the "path" and "file" objects from the document.
        "remove" : {
          "field" : ["path", "file"],
          "tag" : "remove_fields"
        }
      },
      {
        // Turn the "|"-delimited author string into an array of authors.
        "split" : {
          "field" : "_source.meta.author",
          "separator" : "\\|",
          "tag" : "split_author"
        }
      },
      {
        // Trim each author entry produced by the split above.
        "foreach" : {
          "field" : "_source.meta.author",
          "processor" : {
            "trim" : {
              "field" : "_ingest._value"
            }
          },
          "tag" : "trim_author"
        }
      },
      {
        // Split the created value on "-"; the first element is read below
        // as the published value.
        "split" : {
          "field" : "_source.meta.created",
          "separator" : "\\-",
          "tag" : "split_created"
        }
      },
      {
        "set" : { "field" : "difficulty", "value" : 3, "tag" : "set_difficulty" }
      },
      {
        "set" : { "field" : "topics", "value" : "Unknown", "tag" : "set_topics" }
      },
      {
        // published = first element of the split "created" value.
        "script": {
          "lang": "painless",
          "source": "ctx.published = ctx.meta.created[0]",
          "tag" : "set_published_date"
        }
      },
      {
        // course = at most the first 32 characters of the title.
        // substring(0, 32) throws on a shorter string, which would fail the
        // document at index time, so titles under 32 chars are copied as-is.
        "script": {
          "lang": "painless",
          "source": "ctx.course = ctx.meta.title.length() > 32 ? ctx.meta.title.substring(0, 32) : ctx.meta.title",
          "tag" : "course"
        }
      }
    ]
  }
})
.then(function () {
   console.log("putPipeline Resolved");
 })
.catch(function (error) {
   console.log("putPipeline error: " + error);
 });

system · May 17, 2018, 1:30am

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.

Topic		Replies	Views
Fscrawler Elasticsearch	2	2956	September 28, 2017
Indexing PDF file in ElasticSearch using Java Code Elasticsearch	2	2640	August 28, 2018
FSCrawler Question Elasticsearch	7	3125	March 17, 2017
Indexing many pdf files Elasticsearch	12	8373	June 16, 2018
Indexing word, pdf documents? Elasticsearch	12	6855	July 7, 2020

How to use FSCrawler to send elasticsearch Base64 encoded PDF?

Related topics