Dear All,
I am new to Elasticsearch. For days I have been trying to follow various
tutorials and posts on indexing and mapping PDF documents attached to
records in a MongoDB database, without success. After running the code
below, I don't get any hits for words that exist in the attached files
stored in MongoDB.
Software versions:
MongoDB: mongodb-linux-x86_64-2.4.3
elasticsearch-river-mongodb: 1.6.9
elasticsearch: 0.90
elasticsearch-mapper-attachments: 1.7.0
Problem No. 1
- BSON structure: the PDF attachment is stored in the "FileContent" field;
it is not in GridFS.
byte [] fileser = iou.read(file);
Pagecount = getpagenum(file);
BasicDBObject articleobject = new BasicDBObject();
articleobject.put("Title", jsonArray.getJSONObject(i).get("Title"));
articleobject.put("Authors",jsonArray.getJSONObject(i).get("Authors"));
articleobject.put("Organization",
jsonArray.getJSONObject(i).get("Organization"));
articleobject.put("Media", jsonArray.getJSONObject(i).get("Media"));
articleobject.put("ISSN", jsonArray.getJSONObject(i).get("ISSN"));
articleobject.put("Pages", jsonArray.getJSONObject(i).get("Pages"));
articleobject.put("Pagecount", Pagecount);
articleobject.put("Abstracts", jsonArray.getJSONObject(i).get("Abstracts"));
articleobject.put("Keywords", "");
articleobject.put("FileContent", fileser);
collection.insert(articleobject);
- Create an index
curl -XPUT "http://localhost:9200/articleindex"
- create a mapping
curl -XPUT 'http://localhost:9200/articleindex/cardiopathy/_mapping' -d '
{
"cardiopathy" : {
"properties" : {
"Authors" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer":
"ik","store" : "yes"},
"Media" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer":
"ik","store" : "yes"},
"Organization" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer":
"ik","store" : "yes"},
"Keywords" : { "type" : "string" ,"indexAnalyzer": "ik","searchAnalyzer":
"ik","store" : "yes"},
"Title" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer":
"ik","store" : "yes"},
"ISSN" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer":
"ik","store" : "yes"},
"Pages" : { "type" : "string" ,"indexAnalyzer": "ik","searchAnalyzer":
"ik","store" : "yes"},
"Abstracts" : { "type" : "string" ,"indexAnalyzer": "ik","searchAnalyzer":
"ik","store" : "yes"},
"FileContent" : { "type" : "string" ,"indexAnalyzer":
"ik","searchAnalyzer": "ik"}
}
}
}'
- create the river
curl -XPUT "http://localhost:9200/_river/mongodb/_meta" -d '
{
"type": "mongodb",
"mongodb": {
"host": "192.168.1.112",
"port": "27107",
"options": {"drop_collection": true },
"db": "ftsearch1",
"collection": "pdf"
},
"index": {
"name": "articleindex",
"type": "cardiopathy"
}
}'
- Search the indexed documents by keyword (no hits are returned):
curl -XGET http://localhost:9200/articleindex/cardiopathy/_search -d'
{
"fields" : ["Title"],
"query" : { "text" : { "FileContent" : "高血压病辨证分型与靶器官相关性研究的新进展" }}
}
'
{"took":179,"timed_out":false,"_shards":{"total":5,"successful":5,"failed":0},"hits":{"total":0,"max_score":null,"hits":[]}}
Problem No. 2
- Alter the mapping ("FileContent" is now of type "attachment"):
curl -XPUT 'http://localhost:9200/articleindex/cardiopathy/_mapping' -d '
{
"cardiopathy" : {
"file" : {
"properties" : {
"Authors" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer":
"ik","store" : "yes"},
"Media" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer":
"ik","store" : "yes"},
"Organization" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer":
"ik","store" : "yes"},
"Keywords" : { "type" : "string" ,"indexAnalyzer": "ik","searchAnalyzer":
"ik","store" : "yes"},
"Title" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer":
"ik","store" : "yes"},
"ISSN" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer":
"ik","store" : "yes"},
"Pages" : { "type" : "string" ,"indexAnalyzer": "ik","searchAnalyzer":
"ik","store" : "yes"},
"Abstracts" : { "type" : "string" ,"indexAnalyzer": "ik","searchAnalyzer":
"ik","store" : "yes"},
"FileContent" : {
"type" : "attachment",
"fields" : {
"file" : { "indexAnalyzer": "ik","searchAnalyzer": "ik","store" : "yes",
"index" : "analyzed" },
"date" : { "store" : "yes" },
"author" : { "store" : "yes" },
"keywords" : { "store" : "yes" },
"content_type" : { "store" : "yes" },
"title" : { "store" : "yes" }
}
}
}
}
}
}'
- Search the indexed documents by keyword again (still no hits):
{"took":63,"timed_out":false,"_shards":{"total":5,"successful":5,"failed":0},"hits":{"total":0,"max_score":null,"hits":[]}}
Problem No. 3
- The attachment is stored in GridFS; in addition, we set the other fields
on the GridFS file.
GridFSInputFile gfsFile = gfsPhoto.createFile(file);
String filename = file.getName();
filename = filename.substring(0, filename.lastIndexOf("."));
gfsFile.setFilename(filename);
gfsFile.put("Title", jsonArray.getJSONObject(i).get("Title"));
gfsFile.put("Authors",jsonArray.getJSONObject(i).get("Authors"));
gfsFile.put("Organization", jsonArray.getJSONObject(i).get("Organization"));
gfsFile.put("Media", jsonArray.getJSONObject(i).get("Media"));
gfsFile.put("ISSN", jsonArray.getJSONObject(i).get("ISSN"));
gfsFile.put("Pages", jsonArray.getJSONObject(i).get("Pages"));
gfsFile.put("Pagecount", Pagecount);
gfsFile.put("Abstracts", jsonArray.getJSONObject(i).get("Abstracts"));
gfsFile.put("Keywords", "");
gfsFile.save();
- Create an index
curl -XPUT "http://localhost:9200/articleindex"
- create a mapping
curl -XPUT 'http://localhost:9200/cardiopathyindex/cardiopathy/_mapping' -d
'{
"cardiopathy": {
"properties" : {
"content" : {
"path" : "full",
"type" : "attachment",
"fields" : {
"content" : {"type": "string","indexAnalyzer":
"ik","searchAnalyzer": "ik"},
"Authors" : {"type": "string","indexAnalyzer":
"ik","searchAnalyzer": "ik"},
"Media" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer": "ik"},
"Organization" : {"type": "string","indexAnalyzer":
"ik","searchAnalyzer": "ik"},
"Keywords" : { "type" : "string" ,"indexAnalyzer":
"ik","searchAnalyzer": "ik"},
"Title" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer": "ik"},
"ISSN" : {"type": "string","indexAnalyzer":
"ik","searchAnalyzer": "ik"},
"Pages" : { "type" : "string" ,"indexAnalyzer":
"ik","searchAnalyzer": "ik"},
"Abstracts" : { "type" : "string" ,"indexAnalyzer":
"ik","searchAnalyzer": "ik"},
"date" : {"format" : "dateOptionalTime","type" : "date" },
"content_type" : { "type" : "string" }
}
},
"chunkSize" : { "type" : "long" },
"md5" : { "type" : "string" },
"length" : { "type" : "long" },
"filename" : { "type" : "string" },
"contentType" : { "type" : "string" },
"uploadDate" : {
"format" : "dateOptionalTime",
"type" : "date"
},
"metadata" : { "type" : "object" }
}
}
}'
- create the river
curl -XPUT "http://localhost:9200/_river/mongodb/_meta" -d '
{
"type": "mongodb",
"mongodb": {
"host": "192.168.1.112",
"port": "27107",
"options": {"drop_collection": true },
"db": "ftsearch",
"collection": "fs",
"gridfs": true
},
"index": {
"name": "cardiopathyindex",
"type": "cardiopathy",
"content_type": "application/pdf"
}
}'
- Search the indexed documents by keyword. This time there are hits, but
the "Title" and "Authors" fields are missing from the query results.
curl -XGET http://localhost:9200/cardiopathyindex/cardiopathy/_search -d'
{
"fields" : ["Title","Authors"],
"query" : { "text" : { "content" : "高血压病辨证分型与靶器官相关性研究的新进展" }}
}
'
{"took":1005,"timed_out":false,"_shards":{"total":5,"successful":5,"failed":0},"hits":{"total":96,"max_score":0.68918943,"hits":[{"_index":"cardiopathyindex","_type":"cardiopathy","_id":"51d972d5948516489d1674d1","_score":0.68918943},{"_index":"cardiopathyindex","_type":"cardiopathy","_id":"51d972db948516489d167545","_score":0.22994329},{"_index":"cardiopathyindex","_type":"cardiopathy","_id":"51d972da948516489d16752c","_score":0.20929527},.......
--
You received this message because you are subscribed to the Google Groups "elasticsearch" group.
To unsubscribe from this group and stop receiving emails from it, send an email to elasticsearch+unsubscribe@googlegroups.com.
For more options, visit https://groups.google.com/groups/opt_out.