I wrote a simple Spark application to display the data:
package com.es_range

import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.elasticsearch.hadoop.cfg.ConfigurationOptions

object Main {
  def main(args: Array[String]): Unit = {
    // Point the ES-Hadoop connector at a local Elasticsearch node
    val sparkConf =
      new SparkConf()
        .setAppName("test")
        .set(ConfigurationOptions.ES_NODES, "localhost")
        .set(ConfigurationOptions.ES_PORT, "9200")
        .set(ConfigurationOptions.ES_INDEX_READ_MISSING_AS_EMPTY, "true")
    sparkConf.setMaster("local[*]")
    sparkConf.set("spark.driver.host", "localhost")

    val spark =
      SparkSession.builder
        .appName("test")
        .config(sparkConf)
        .enableHiveSupport()
        .getOrCreate()

    // Read the index into a DataFrame and show the first few rows
    val esDataFrame =
      spark.sqlContext.read
        .format("org.elasticsearch.spark.sql")
        .load("range_index/_doc")
    esDataFrame.show(10)
  }
}
When I try to run it, I get the following error: org.elasticsearch.hadoop.rest.EsHadoopParsingException: org.elasticsearch.hadoop.EsHadoopIllegalStateException: Field '_' not found; typically this occurs with arrays which are not mapped as single value
I don't understand what is going on there or why the data cannot be loaded into the DataFrame. Could someone shed some light?
Is this maybe a bug?
Can you include the full stack trace for the error? That would help us track down where in the connector this is happening.
To be honest, that error message should probably be different, or the failure should happen earlier in the process. At the moment, ES-Hadoop does not support range fields.
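For reference, a "range field" is a field mapped with one of the range datatypes (integer_range, date_range, and so on). A minimal mapping that reproduces this kind of failure could look like the following (Elasticsearch 7.x syntax; the index and field names here are placeholders):

PUT range_index
{
  "mappings": {
    "properties": {
      "my_range": { "type": "integer_range" }
    }
  }
}

A range value in _source is a structured object such as {"gte": 10, "lte": 20} rather than a single scalar, which is presumably what the connector trips over when it tries to map the document into a Spark SQL row.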
Yep, I thought so. That would be a nice addition to the documentation.
Here is the full stack trace:
19/08/26 09:51:28 INFO DAGScheduler: Job 0 failed: show at Main.scala:36, took 0.366181 s
Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 0.0 failed 1 times, most recent failure: Lost task 0.0 in stage 0.0 (TID 0, localhost, executor driver): org.elasticsearch.hadoop.rest.EsHadoopParsingException: org.elasticsearch.hadoop.EsHadoopIllegalStateException: Field '_' not found; typically this occurs with arrays which are not mapped as single value
    at org.elasticsearch.hadoop.serialization.ScrollReader.readHit(ScrollReader.java:514)
    at org.elasticsearch.hadoop.serialization.ScrollReader.read(ScrollReader.java:292)
    at org.elasticsearch.hadoop.serialization.ScrollReader.read(ScrollReader.java:262)
    at org.elasticsearch.hadoop.rest.RestRepository.scroll(RestRepository.java:313)
    at org.elasticsearch.hadoop.rest.ScrollQuery.hasNext(ScrollQuery.java:93)
    at org.elasticsearch.spark.rdd.AbstractEsRDDIterator.hasNext(AbstractEsRDDIterator.scala:61)
    at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
    at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
    at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
    at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
    at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:255)
    at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:247)
    at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:836)
    at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:836)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
    at org.apache.spark.scheduler.Task.run(Task.scala:121)
    at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
    at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    at java.lang.Thread.run(Thread.java:748)
Caused by: org.elasticsearch.hadoop.EsHadoopIllegalStateException: Field '_' not found; typically this occurs with arrays which are not mapped as single value
    at org.elasticsearch.spark.sql.RowValueReader$class.rowColumns(RowValueReader.scala:51)
    at org.elasticsearch.spark.sql.ScalaRowValueReader.rowColumns(ScalaEsRowValueReader.scala:32)
    at org.elasticsearch.spark.sql.ScalaRowValueReader.createMap(ScalaEsRowValueReader.scala:69)
    at org.elasticsearch.hadoop.serialization.ScrollReader.map(ScrollReader.java:1011)
    at org.elasticsearch.hadoop.serialization.ScrollReader.read(ScrollReader.java:889)
    at org.elasticsearch.hadoop.serialization.ScrollReader.readHitAsMap(ScrollReader.java:602)
    at org.elasticsearch.hadoop.serialization.ScrollReader.readHit(ScrollReader.java:426)
    ... 27 more
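In the meantime, a possible workaround (an untested sketch, assuming the range field is named my_range) is to exclude the field from the read with the connector's es.read.field.exclude option, so it never has to be deserialized:

// Hedged sketch: skip the unsupported range field when reading.
// "my_range" is a placeholder; use your actual range field name.
val esDataFrame =
  spark.read
    .format("org.elasticsearch.spark.sql")
    // Excluded fields are dropped from the schema and from the
    // _source returned by the scroll requests
    .option("es.read.field.exclude", "my_range")
    .load("range_index/_doc")
esDataFrame.show(10)

Note that you lose the range data with this approach; if you need the range bounds in Spark, one option is to also index them as separate numeric or date fields.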