How to index a PDF attachment with elasticsearch-river-mongodb 1.6.9
(no hits, or missing fields)
Dear All,
I am new to Elasticsearch. For days I have tried to follow various
tutorials and posts on indexing and mapping PDF documents attached to
records in a MongoDB database, without success. After running the code
below I get no hits for words that I know exist in the attached files.
Software versions:
MongoDB: mongodb-linux-x86_64-2.4.3
elasticsearch-river-mongodb: 1.6.9
elasticsearch: 0.90
elasticsearch-mapper-attachments: 1.7.0
Problem No. 1
- BSON structure: the PDF attachment is stored as raw bytes in the
"FileContent" field; the attachment is not in GridFS.
// Read the PDF into a byte array; iou and getpagenum are helper
// utilities elsewhere in my code.
byte[] fileser = iou.read(file);
int Pagecount = getpagenum(file);
BasicDBObject articleobject = new BasicDBObject();
articleobject.put("Title", jsonArray.getJSONObject(i).get("Title"));
articleobject.put("Authors", jsonArray.getJSONObject(i).get("Authors"));
articleobject.put("Organization", jsonArray.getJSONObject(i).get("Organization"));
articleobject.put("Media", jsonArray.getJSONObject(i).get("Media"));
articleobject.put("ISSN", jsonArray.getJSONObject(i).get("ISSN"));
articleobject.put("Pages", jsonArray.getJSONObject(i).get("Pages"));
articleobject.put("Pagecount", Pagecount);
articleobject.put("Abstracts", jsonArray.getJSONObject(i).get("Abstracts"));
articleobject.put("Keywords", "");
articleobject.put("FileContent", fileser); // raw PDF bytes, stored as BSON binary
collection.insert(articleobject);
- create an index
curl -XPUT "http://localhost:9200/articleindex"
- create a mapping
curl -XPUT 'http://localhost:9200/articleindex/cardiopathy/_mapping' -d '
{
  "cardiopathy" : {
    "properties" : {
      "Authors"      : { "type" : "string", "indexAnalyzer" : "ik", "searchAnalyzer" : "ik", "store" : "yes" },
      "Media"        : { "type" : "string", "indexAnalyzer" : "ik", "searchAnalyzer" : "ik", "store" : "yes" },
      "Organization" : { "type" : "string", "indexAnalyzer" : "ik", "searchAnalyzer" : "ik", "store" : "yes" },
      "Keywords"     : { "type" : "string", "indexAnalyzer" : "ik", "searchAnalyzer" : "ik", "store" : "yes" },
      "Title"        : { "type" : "string", "indexAnalyzer" : "ik", "searchAnalyzer" : "ik", "store" : "yes" },
      "ISSN"         : { "type" : "string", "indexAnalyzer" : "ik", "searchAnalyzer" : "ik", "store" : "yes" },
      "Pages"        : { "type" : "string", "indexAnalyzer" : "ik", "searchAnalyzer" : "ik", "store" : "yes" },
      "Abstracts"    : { "type" : "string", "indexAnalyzer" : "ik", "searchAnalyzer" : "ik", "store" : "yes" },
      "FileContent"  : { "type" : "string", "indexAnalyzer" : "ik", "searchAnalyzer" : "ik" }
    }
  }
}'
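As a sanity check (my own addition, not a step from the tutorials), the mapping can be read back to confirm it was applied as written:
curl -XGET 'http://localhost:9200/articleindex/cardiopathy/_mapping?pretty'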
- create the river
curl -XPUT "http://localhost:9200/_river/mongodb/_meta" -d '
{
"type": "mongodb",
"mongodb": {
"host": "192.168.1.112",
"port": "27107",
"options": {"drop_collection": true },
"db": "ftsearch1",
"collection": "pdf"
},
"index": {
"name": "articleindex",
"type": "cardiopathy"
}
}'
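The river configuration can likewise be read back, which at least rules out a malformed _meta document (this is an ordinary document GET, nothing river-specific):
curl -XGET 'http://localhost:9200/_river/mongodb/_meta?pretty'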
- Retrieve the indexed document by the keyword
curl -XGET http://localhost:9200/articleindex/cardiopathy/_search -d'
{
"fields" : ["Title"],
"query" : { "text" : { "FileContent" : "高血压病辨证分型与靶器官相关性研究的新进展" }}
}
'
{"took":179,"timed_out":false,"_shards":{"total":5,"successful":5,"failed":0},"hits":{"total":0,"max_score":null,"hits":}}
Problem No. 2
- alter the mapping:
curl -XPUT 'http://localhost:9200/articleindex/cardiopathy/_mapping' -d '
{
  "cardiopathy" : {
    "file" : {
      "properties" : {
        "Authors"      : { "type" : "string", "indexAnalyzer" : "ik", "searchAnalyzer" : "ik", "store" : "yes" },
        "Media"        : { "type" : "string", "indexAnalyzer" : "ik", "searchAnalyzer" : "ik", "store" : "yes" },
        "Organization" : { "type" : "string", "indexAnalyzer" : "ik", "searchAnalyzer" : "ik", "store" : "yes" },
        "Keywords"     : { "type" : "string", "indexAnalyzer" : "ik", "searchAnalyzer" : "ik", "store" : "yes" },
        "Title"        : { "type" : "string", "indexAnalyzer" : "ik", "searchAnalyzer" : "ik", "store" : "yes" },
        "ISSN"         : { "type" : "string", "indexAnalyzer" : "ik", "searchAnalyzer" : "ik", "store" : "yes" },
        "Pages"        : { "type" : "string", "indexAnalyzer" : "ik", "searchAnalyzer" : "ik", "store" : "yes" },
        "Abstracts"    : { "type" : "string", "indexAnalyzer" : "ik", "searchAnalyzer" : "ik", "store" : "yes" },
        "FileContent"  : {
          "type" : "attachment",
          "fields" : {
            "file"         : { "indexAnalyzer" : "ik", "searchAnalyzer" : "ik", "store" : "yes", "index" : "analyzed" },
            "date"         : { "store" : "yes" },
            "author"       : { "store" : "yes" },
            "keywords"     : { "store" : "yes" },
            "content_type" : { "store" : "yes" },
            "title"        : { "store" : "yes" }
          }
        }
      }
    }
  }
}'
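Since everything above depends on the ik analyzer, it is also worth confirming the plugin is installed and actually tokenizes Chinese text; the _analyze API takes the sample text as the request body:
curl -XGET 'http://localhost:9200/articleindex/_analyze?analyzer=ik&pretty' -d '高血压病辨证分型'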
- Retrieve the indexed document by the keyword (same query as in Problem No. 1):
{"took":63,"timed_out":false,"_shards":{"total":5,"successful":5,"failed":0},"hits":{"total":0,"max_score":null,"hits":}}
Problem No. 3
- The attachment is stored in GridFS; in addition, we define the other
metadata fields on the GridFS file.
// gfsPhoto is a com.mongodb.gridfs.GridFS instance for the "fs" bucket.
GridFSInputFile gfsFile = gfsPhoto.createFile(file);
// Strip the file extension before storing the filename.
String filename = file.getName();
filename = filename.substring(0, filename.lastIndexOf("."));
gfsFile.setFilename(filename);
// Attach our own metadata fields to the fs.files document.
gfsFile.put("Title", jsonArray.getJSONObject(i).get("Title"));
gfsFile.put("Authors", jsonArray.getJSONObject(i).get("Authors"));
gfsFile.put("Organization", jsonArray.getJSONObject(i).get("Organization"));
gfsFile.put("Media", jsonArray.getJSONObject(i).get("Media"));
gfsFile.put("ISSN", jsonArray.getJSONObject(i).get("ISSN"));
gfsFile.put("Pages", jsonArray.getJSONObject(i).get("Pages"));
gfsFile.put("Pagecount", Pagecount);
gfsFile.put("Abstracts", jsonArray.getJSONObject(i).get("Abstracts"));
gfsFile.put("Keywords", "");
gfsFile.save();
- create an index
curl -XPUT "http://localhost:9200/cardiopathyindex"
- create a mapping
curl -XPUT 'http://localhost:9200/cardiopathyindex/cardiopathy/_mapping' -d '
{
  "cardiopathy" : {
    "properties" : {
      "content" : {
        "path" : "full",
        "type" : "attachment",
        "fields" : {
          "content"      : { "type" : "string", "indexAnalyzer" : "ik", "searchAnalyzer" : "ik" },
          "Authors"      : { "type" : "string", "indexAnalyzer" : "ik", "searchAnalyzer" : "ik" },
          "Media"        : { "type" : "string", "indexAnalyzer" : "ik", "searchAnalyzer" : "ik" },
          "Organization" : { "type" : "string", "indexAnalyzer" : "ik", "searchAnalyzer" : "ik" },
          "Keywords"     : { "type" : "string", "indexAnalyzer" : "ik", "searchAnalyzer" : "ik" },
          "Title"        : { "type" : "string", "indexAnalyzer" : "ik", "searchAnalyzer" : "ik" },
          "ISSN"         : { "type" : "string", "indexAnalyzer" : "ik", "searchAnalyzer" : "ik" },
          "Pages"        : { "type" : "string", "indexAnalyzer" : "ik", "searchAnalyzer" : "ik" },
          "Abstracts"    : { "type" : "string", "indexAnalyzer" : "ik", "searchAnalyzer" : "ik" },
          "date"         : { "type" : "date", "format" : "dateOptionalTime" },
          "content_type" : { "type" : "string" }
        }
      },
      "chunkSize"   : { "type" : "long" },
      "md5"         : { "type" : "string" },
      "length"      : { "type" : "long" },
      "filename"    : { "type" : "string" },
      "contentType" : { "type" : "string" },
      "uploadDate"  : { "type" : "date", "format" : "dateOptionalTime" },
      "metadata"    : { "type" : "object" }
    }
  }
}'
- create the river
curl -XPUT "http://localhost:9200/_river/mongodb/_meta" -d '
{
"type": "mongodb",
"mongodb": {
"host": "192.168.1.112",
"port": "27107",
"options": {"drop_collection": true },
"db": "ftsearch",
"collection": "fs",
"gridfs": true
},
"index": {
"name": "cardiopathyindex",
"type": "cardiopathy",
"content_type": "application/pdf"
}
}'
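Because the river may also create fields dynamically as documents stream in, comparing the mapping Elasticsearch actually holds with the one defined above can show where the metadata fields really ended up:
curl -XGET 'http://localhost:9200/cardiopathyindex/_mapping?pretty'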
- Retrieve the indexed document by the keyword. This time there are
hits, but the query result is missing the "Title" and "Authors" fields.
curl -XGET http://localhost:9200/cardiopathyindex/cardiopathy/_search -d'
{
"fields" : ["Title","Authors"],
"query" : { "text" : { "content" : "高血压病辨证分型与靶器官相关性研究的新进展" }}
}
'
{"took":1005,"timed_out":false,"_shards":{"total":5,"successful":5,"failed":0},"hits":{"total":96,"max_score":0.68918943,"hits":[{"_index":"cardiopathyindex","_type":"cardiopathy","_id":"51d972d5948516489d1674d1","_score":0.68918943},{"_index":"cardiopathyindex","_type":"cardiopathy","_id":"51d972db948516489d167545","_score":0.22994329},{"_index":"cardiopathyindex","_type":"cardiopathy","_id":"51d972da948516489d16752c","_score":0.20929527},.......
On Saturday, November 24, 2012 at 5:48:09 AM UTC+8, Sanx El Santo wrote:
Greetings to all,
I'm trying to integrate MongoDB with Elasticsearch.
I'm using this guide:
A complete guide to Integrating MongoDB with Elastic Search - Satish Gandham
with the following command in my terminal:
curl -XPUT 'http://localhost:9200/_river/mongodb/_meta' -d '{
"type": "mongodb",
"mongodb": {
"db": "santix",
"collection": "posts"
},
"index": {
"name": "mongoindex",
"type": "posts"
}
}'
and the output is:
{"error":"MapperParsingException[Failed to parse]; nested:
JsonParseException[Unexpected end-of-input in field name\n at [Source:
[B@1bfa1ba1; line: 2, column: 2]]; ","status":400}
and the elasticsearch output is:
[2012-11-23 10:25:12,843][DEBUG][action.index] [Rafferty]
[_river][0], node[q2OyKdZaRXGf-SMhPGSrSw], [P], s[STARTED]: Failed to
execute [index {[_river][mongodb][_meta], source[{
    "type": "mongodb",
    "mongodb": {
        "db": "santix",
        "collection": "posts"
    },
    "index": {
        "name": "mongoindex",
        "type": "posts"
    }
}]}]
org.elasticsearch.index.mapper.MapperParsingException: Failed to parse
    at org.elasticsearch.index.mapper.DocumentMapper.parse(DocumentMapper.java:509)
    at org.elasticsearch.index.mapper.DocumentMapper.parse(DocumentMapper.java:438)
    at org.elasticsearch.index.shard.service.InternalIndexShard.prepareIndex(InternalIndexShard.java)
    at org.elasticsearch.action.index.TransportIndexAction.shardOperationOnPrimary(TransportIndexAction.java)
    at org.elasticsearch.action.support.replication.TransportShardReplicationOperationAction$AsyncShardOperationAction.performOnPrimary(TransportShardReplicationOperationAction.java)
    at org.elasticsearch.action.support.replication.TransportShardReplicationOperationAction$AsyncShardOperationAction$1.run(TransportShardReplicationOperationAction.java:430)
    at java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:886)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:908)
    at java.lang.Thread.run(Thread.java:680)
Caused by: org.elasticsearch.common.jackson.core.JsonParseException: Unexpected end-of-input in field name
 at [Source: [B@1bfa1ba1; line: 2, column: 2]
    at org.elasticsearch.common.jackson.core.JsonParser._constructError(JsonParser.java)
    at org.elasticsearch.common.jackson.core.base.ParserMinimalBase._reportError(ParserMinimalBase.java)
    at org.elasticsearch.common.jackson.core.base.ParserMinimalBase._reportInvalidEOF(ParserMinimalBase.java)
    at org.elasticsearch.common.jackson.core.json.UTF8StreamJsonParser.addName(UTF8StreamJsonParser.java)
    at org.elasticsearch.common.jackson.core.json.UTF8StreamJsonParser._handleUnusualFieldName(UTF8StreamJsonParser.java)
    at org.elasticsearch.common.jackson.core.json.UTF8StreamJsonParser._parseFieldName(UTF8StreamJsonParser.java)
    at org.elasticsearch.common.jackson.core.json.UTF8StreamJsonParser.nextToken(UTF8StreamJsonParser.java)
    at org.elasticsearch.common.xcontent.json.JsonXContentParser.nextToken(JsonXContentParser.java)
    at org.elasticsearch.index.mapper.DocumentMapper.parse(DocumentMapper.java:468)
    ... 8 more
Can someone please help me and tell me what I'm missing or what I'm
doing wrong?
Thank you very much for your attention