How to create index for a attachment of pdf by using elasticsearch-river-mongodb: 1.6.9 (don't have any hits,or missing fields)

Jordon · July 8, 2013, 10:38am

Dear All,
I am new to Elasticsearch. For days I have been following various tutorials
and posts on indexing and mapping PDF documents attached in a MongoDB
database, without success. After running the code below, I don't get any
hits for words that exist in the files attached in MongoDB.

software version:
MongoDB: mongodb-linux-x86_64-2.4.3
elasticsearch-river-mongodb: 1.6.9
elasticsearch: 0.90
elasticsearch-mapper-attachments: 1.7.0

Problem No. 1

BSON structure: the PDF attachment is stored in the "FileContent" field; the
attachment is not in GridFS.
byte [] fileser = iou.read(file);
Pagecount = getpagenum(file);
BasicDBObject articleobject = new BasicDBObject();
articleobject.put("Title", jsonArray.getJSONObject(i).get("Title"));
articleobject.put("Authors",jsonArray.getJSONObject(i).get("Authors"));
articleobject.put("Organization",
jsonArray.getJSONObject(i).get("Organization"));
articleobject.put("Media", jsonArray.getJSONObject(i).get("Media"));
articleobject.put("ISSN", jsonArray.getJSONObject(i).get("ISSN"));
articleobject.put("Pages", jsonArray.getJSONObject(i).get("Pages"));
articleobject.put("Pagecount", Pagecount);
articleobject.put("Abstracts", jsonArray.getJSONObject(i).get("Abstracts"));
articleobject.put("Keywords", "");
articleobject.put("FileContent", fileser);
collection.insert(articleobject);

create an index
curl -XPUT "http://localhost:9200/articleindex"

create a mapping
curl -XPUT 'http://localhost:9200/articleindex/cardiopathy/_mapping' -d '
{
"cardiopathy" : {
"properties" : {
"Authors" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer":
"ik","store" : "yes"},
"Media" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer":
"ik","store" : "yes"},
"Organization" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer":
"ik","store" : "yes"},
"Keywords" : { "type" : "string" ,"indexAnalyzer": "ik","searchAnalyzer":
"ik","store" : "yes"},
"Title" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer":
"ik","store" : "yes"},
"ISSN" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer":
"ik","store" : "yes"},
"Pages" : { "type" : "string" ,"indexAnalyzer": "ik","searchAnalyzer":
"ik","store" : "yes"},
"Abstracts" : { "type" : "string" ,"indexAnalyzer": "ik","searchAnalyzer":
"ik","store" : "yes"},
"FileContent" : { "type" : "string" ,"indexAnalyzer":
"ik","searchAnalyzer": "ik"}
}
}
}'

create the river
curl -XPUT "http://localhost:9200/_river/mongodb/_meta" -d '
{
"type": "mongodb",
"mongodb": {
"host": "192.168.1.112",
"port": "27107",
"options": {"drop_collection": true },
"db": "ftsearch1",
"collection": "pdf"
},
"index": {
"name": "articleindex",
"type": "cardiopathy"
}
}'

Retrieve the indexed document by the keyword
curl -XGET http://localhost:9200/articleindex/cardiopathy/_search -d'
{
"fields" : ["Title"],
"query" : { "text" : { "FileContent" : "高血压病辨证分型与靶器官相关性研究的新进展" }}
}
'

{"took":179,"timed_out":false,"_shards":{"total":5,"successful":5,"failed":0},"hits":{"total":0,"max_score":null,"hits":[]}}

Problem No. 2

Alter the mapping:

curl -XPUT 'http://localhost:9200/articleindex/cardiopathy/_mapping' -d '
{
"cardiopathy" : {
"file" : {
"properties" : {
"Authors" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer":
"ik","store" : "yes"},
"Media" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer":
"ik","store" : "yes"},
"Organization" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer":
"ik","store" : "yes"},
"Keywords" : { "type" : "string" ,"indexAnalyzer": "ik","searchAnalyzer":
"ik","store" : "yes"},
"Title" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer":
"ik","store" : "yes"},
"ISSN" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer":
"ik","store" : "yes"},
"Pages" : { "type" : "string" ,"indexAnalyzer": "ik","searchAnalyzer":
"ik","store" : "yes"},
"Abstracts" : { "type" : "string" ,"indexAnalyzer": "ik","searchAnalyzer":
"ik","store" : "yes"},
"FileContent" : {
"type" : "attachment",
"fields" : {
"file" : { "indexAnalyzer": "ik","searchAnalyzer": "ik","store" : "yes",
"index" : "analyzed" },
"date" : { "store" : "yes" },
"author" : { "store" : "yes" },
"keywords" : { "store" : "yes" },
"content_type" : { "store" : "yes" },
"title" : { "store" : "yes" }
}
}
}
}
}
}'

Retrieve the indexed document by the keyword:

{"took":63,"timed_out":false,"_shards":{"total":5,"successful":5,"failed":0},"hits":{"total":0,"max_score":null,"hits":[]}}

Problem No. 3

The attachment is in GridFS; in addition, we define the other fields.
GridFSInputFile gfsFile = gfsPhoto.createFile(file);
String filename = file.getName();
filename = filename.substring(0, filename.lastIndexOf("."));
gfsFile.setFilename(filename);
gfsFile.put("Title", jsonArray.getJSONObject(i).get("Title"));
gfsFile.put("Authors",jsonArray.getJSONObject(i).get("Authors"));
gfsFile.put("Organization", jsonArray.getJSONObject(i).get("Organization"));
gfsFile.put("Media", jsonArray.getJSONObject(i).get("Media"));
gfsFile.put("ISSN", jsonArray.getJSONObject(i).get("ISSN"));
gfsFile.put("Pages", jsonArray.getJSONObject(i).get("Pages"));
gfsFile.put("Pagecount", Pagecount);
gfsFile.put("Abstracts", jsonArray.getJSONObject(i).get("Abstracts"));
gfsFile.put("Keywords", "");
gfsFile.save();

create a index
curl -XPUT "http://localhost:9200/articleindex"

create a mapping
curl -XPUT 'http://localhost:9200/cardiopathyindex/cardiopathy/_mapping' -d
'{
"cardiopathy": {
"properties" : {
"content" : {
"path" : "full",
"type" : "attachment",
"fields" : {
"content" : {"type": "string","indexAnalyzer":
"ik","searchAnalyzer": "ik"},
"Authors" : {"type": "string","indexAnalyzer":
"ik","searchAnalyzer": "ik"},
"Media" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer": "ik"},
"Organization" : {"type": "string","indexAnalyzer":
"ik","searchAnalyzer": "ik"},
"Keywords" : { "type" : "string" ,"indexAnalyzer":
"ik","searchAnalyzer": "ik"},
"Title" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer": "ik"},
"ISSN" : {"type": "string","indexAnalyzer":
"ik","searchAnalyzer": "ik"},
"Pages" : { "type" : "string" ,"indexAnalyzer":
"ik","searchAnalyzer": "ik"},
"Abstracts" : { "type" : "string" ,"indexAnalyzer":
"ik","searchAnalyzer": "ik"},
"date" : {"format" : "dateOptionalTime","type" : "date" },
"content_type" : { "type" : "string" }
}
},
"chunkSize" : { "type" : "long" },
"md5" : { "type" : "string" },
"length" : { "type" : "long" },
"filename" : { "type" : "string" },
"contentType" : { "type" : "string" },
"uploadDate" : {
"format" : "dateOptionalTime",
"type" : "date"
},
"metadata" : { "type" : "object" }
}
}
}'

create the river
curl -XPUT "http://localhost:9200/_river/mongodb/_meta" -d '
{
"type": "mongodb",
"mongodb": {
"host": "192.168.1.112",
"port": "27107",
"options": {"drop_collection": true },
"db": "ftsearch",
"collection": "fs",
"gridfs": true
},
"index": {
"name": "cardiopathyindex",
"type": "cardiopathy",
"content_type": "application/pdf"
}
}'

Retrieve the indexed document by the keyword: there are hits, but the query
result is missing the "Title" and "Authors" fields.
curl -XGET http://localhost:9200/cardiopathyindex/cardiopathy/_search -d'
{
"fields" : ["Title","Authors"],
"query" : { "text" : { "content" : "高血压病辨证分型与靶器官相关性研究的新进展" }}
}
'
{"took":1005,"timed_out":false,"_shards":{"total":5,"successful":5,"failed":0},"hits":{"total":96,"max_score":0.68918943,"hits":[{"_index":"cardiopathyindex","_type":"cardiopathy","_id":"51d972d5948516489d1674d1","_score":0.68918943},{"_index":"cardiopathyindex","_type":"cardiopathy","_id":"51d972db948516489d167545","_score":0.22994329},{"_index":"cardiopathyindex","_type":"cardiopathy","_id":"51d972da948516489d16752c","_score":0.20929527},.......

--
You received this message because you are subscribed to the Google Groups "elasticsearch" group.
To unsubscribe from this group and stop receiving emails from it, send an email to elasticsearch+unsubscribe@googlegroups.com.
For more options, visit https://groups.google.com/groups/opt_out.

dadoonet · July 8, 2013, 2:19pm

I think you should create the mapping first (don't alter the mapping afterwards, as that basically works only for new fields and not for existing ones):

curl -XPUT 'http://localhost:9200/articleindex/cardiopathy/_mapping' -d '
{
"cardiopathy" : {
"file" : {
"properties" : {
"Authors" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer": "ik","store" : "yes"},
"Media" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer": "ik","store" : "yes"},
"Organization" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer": "ik","store" : "yes"},
"Keywords" : { "type" : "string" ,"indexAnalyzer": "ik","searchAnalyzer": "ik","store" : "yes"},
"Title" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer": "ik","store" : "yes"},
"ISSN" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer": "ik","store" : "yes"},
"Pages" : { "type" : "string" ,"indexAnalyzer": "ik","searchAnalyzer": "ik","store" : "yes"},
"Abstracts" : { "type" : "string" ,"indexAnalyzer": "ik","searchAnalyzer": "ik","store" : "yes"},
"FileContent" : {
"type" : "attachment",
"fields" : {
"file" : { "indexAnalyzer": "ik","searchAnalyzer": "ik","store" : "yes", "index" : "analyzed" },
"date" : { "store" : "yes" },
"author" : { "store" : "yes" },
"keywords" : { "store" : "yes" },
"content_type" : { "store" : "yes" },
"title" : { "store" : "yes" }
}
}
}
}
}
}'

HTH

--
David Pilato | Technical Advocate | Elasticsearch.com
@dadoonet | @elasticsearchfr | @scrutmydocs

Le 8 juil. 2013 à 12:38, Jordon quwu.ustb@gmail.com a écrit :

Dear All,
I am new to elasticsearch. I have tried to follow the different tutorials and post on index and mapping attached pdf document in a mongodb database for days without success. After running the codes below i don't have any hits from words that exist in the mongodb attached files.

software version:
MongoDB: mongodb-linux-x86_64-2.4.3
elasticsearch-river-mongodb: 1.6.9
elasticsearch: 0.90
elasticsearch-mapper-attachments: 1.7.0

Problem No. 1

BSON Structure, PDF attachment is in the "FileContent" field, the attachment is not in GridFS.
byte [] fileser = iou.read(file);
Pagecount = getpagenum(file);
BasicDBObject articleobject = new BasicDBObject();
articleobject.put("Title", jsonArray.getJSONObject(i).get("Title"));
articleobject.put("Authors",jsonArray.getJSONObject(i).get("Authors"));
articleobject.put("Organization", jsonArray.getJSONObject(i).get("Organization"));
articleobject.put("Media", jsonArray.getJSONObject(i).get("Media"));
articleobject.put("ISSN", jsonArray.getJSONObject(i).get("ISSN"));
articleobject.put("Pages", jsonArray.getJSONObject(i).get("Pages"));
articleobject.put("Pagecount", Pagecount);
articleobject.put("Abstracts", jsonArray.getJSONObject(i).get("Abstracts"));
articleobject.put("Keywords", "");
articleobject.put("FileContent", fileser);
collection.insert(articleobject);
   create a index
curl -XPUT "http://localhost:9200/articleindex"
   create a mapping
curl -XPUT 'http://localhost:9200/articleindex/cardiopathy/_mapping' -d '
{
"cardiopathy" : {
"properties" : {
"Authors" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer": "ik","store" : "yes"},
"Media" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer": "ik","store" : "yes"},
"Organization" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer": "ik","store" : "yes"},
"Keywords" : { "type" : "string" ,"indexAnalyzer": "ik","searchAnalyzer": "ik","store" : "yes"},
"Title" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer": "ik","store" : "yes"},
"ISSN" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer": "ik","store" : "yes"},
"Pages" : { "type" : "string" ,"indexAnalyzer": "ik","searchAnalyzer": "ik","store" : "yes"},
"Abstracts" : { "type" : "string" ,"indexAnalyzer": "ik","searchAnalyzer": "ik","store" : "yes"},
"FileContent" : { "type" : "string" ,"indexAnalyzer": "ik","searchAnalyzer": "ik"}
}
}
}'
create the river
curl -XPUT "http://localhost:9200/_river/mongodb/_meta" -d '
{
"type": "mongodb",
"mongodb": {
"host": "192.168.1.112",
"port": "27107",
"options": {"drop_collection": true },
"db": "ftsearch1",
"collection": "pdf"
},
"index": {
"name": "articleindex",
"type": "cardiopathy"
}
}'

Retrieve the indexed document by the keyword
curl -XGET http://localhost:9200/articleindex/cardiopathy/_search -d'
{
"fields" : ["Title"],
"query" : { "text" : { "FileContent" : "高血压病辨证分型与靶器官相关性研究的新进展" }}
}
'

{"took":179,"timed_out":false,"_shards":{"total":5,"successful":5,"failed":0},"hits":{"total":0,"max_score":null,"hits":[]}}

Problem No. 2

alter mapping：

curl -XPUT 'http://localhost:9200/articleindex/cardiopathy/_mapping' -d '
{
"cardiopathy" : {
"file" : {
"properties" : {
"Authors" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer": "ik","store" : "yes"},
"Media" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer": "ik","store" : "yes"},
"Organization" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer": "ik","store" : "yes"},
"Keywords" : { "type" : "string" ,"indexAnalyzer": "ik","searchAnalyzer": "ik","store" : "yes"},
"Title" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer": "ik","store" : "yes"},
"ISSN" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer": "ik","store" : "yes"},
"Pages" : { "type" : "string" ,"indexAnalyzer": "ik","searchAnalyzer": "ik","store" : "yes"},
"Abstracts" : { "type" : "string" ,"indexAnalyzer": "ik","searchAnalyzer": "ik","store" : "yes"},
"FileContent" : {
"type" : "attachment",
"fields" : {
"file" : { "indexAnalyzer": "ik","searchAnalyzer": "ik","store" : "yes", "index" : "analyzed" },
"date" : { "store" : "yes" },
"author" : { "store" : "yes" },
"keywords" : { "store" : "yes" },
"content_type" : { "store" : "yes" },
"title" : { "store" : "yes" }
}
}
}
}
}
}'

Retrieve the indexed document by the keyword：

{"took":63,"timed_out":false,"_shards":{"total":5,"successful":5,"failed":0},"hits":{"total":0,"max_score":null,"hits":[]}}

Problem No. 3
   The attachment is in GridFS, in addition, we define the other fields.
GridFSInputFile gfsFile = gfsPhoto.createFile(file);
String filename = file.getName();
filename = filename.substring(0, filename.lastIndexOf("."));
gfsFile.setFilename(filename);
gfsFile.put("Title", jsonArray.getJSONObject(i).get("Title"));
gfsFile.put("Authors",jsonArray.getJSONObject(i).get("Authors"));
gfsFile.put("Organization", jsonArray.getJSONObject(i).get("Organization"));
gfsFile.put("Media", jsonArray.getJSONObject(i).get("Media"));
gfsFile.put("ISSN", jsonArray.getJSONObject(i).get("ISSN"));
gfsFile.put("Pages", jsonArray.getJSONObject(i).get("Pages"));
gfsFile.put("Pagecount", Pagecount);
gfsFile.put("Abstracts", jsonArray.getJSONObject(i).get("Abstracts"));
gfsFile.put("Keywords", "");
gfsFile.save();
   create a index
curl -XPUT "http://localhost:9200/articleindex"
   create a mapping
curl -XPUT 'http://localhost:9200/cardiopathyindex/cardiopathy/_mapping' -d '{
"cardiopathy": {
"properties" : {
"content" : {
"path" : "full",
"type" : "attachment",
"fields" : {
"content" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer": "ik"},
"Authors" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer": "ik"},
"Media" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer": "ik"},
"Organization" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer": "ik"},
"Keywords" : { "type" : "string" ,"indexAnalyzer": "ik","searchAnalyzer": "ik"},
"Title" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer": "ik"},
"ISSN" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer": "ik"},
"Pages" : { "type" : "string" ,"indexAnalyzer": "ik","searchAnalyzer": "ik"},
"Abstracts" : { "type" : "string" ,"indexAnalyzer": "ik","searchAnalyzer": "ik"},
"date" : {"format" : "dateOptionalTime","type" : "date" },
"content_type" : { "type" : "string" }
}
},
"chunkSize" : { "type" : "long" },
"md5" : { "type" : "string" },
"length" : { "type" : "long" },
"filename" : { "type" : "string" },
"contentType" : { "type" : "string" },
"uploadDate" : {
"format" : "dateOptionalTime",
"type" : "date"
},
"metadata" : { "type" : "object" }
}
}
}'
create the river
curl -XPUT "http://localhost:9200/_river/mongodb/_meta" -d '
{
"type": "mongodb",
"mongodb": {
"host": "192.168.1.112",
"port": "27107",
"options": {"drop_collection": true },
"db": "ftsearch",
"collection": "fs",
"gridfs": true
},
"index": {
"name": "cardiopathyindex",
"type": "cardiopathy",
"content_type": "application/pdf"
}
}'

Retrieve the indexed document by the keyword, hit, but the query result is missing the "Title" and "Authors" fields.
curl -XGET http://localhost:9200/cardiopathyindex/cardiopathy/_search -d'
{
"fields" : ["Title","Authors"],
"query" : { "text" : { "content" : "高血压病辨证分型与靶器官相关性研究的新进展" }}
}
'
{"took":1005,"timed_out":false,"_shards":{"total":5,"successful":5,"failed":0},"hits":{"total":96,"max_score":0.68918943,"hits":[{"_index":"cardiopathyindex","_type":"cardiopathy","_id":"51d972d5948516489d1674d1","_score":0.68918943},{"_index":"cardiopathyindex","_type":"cardiopathy","_id":"51d972db948516489d167545","_score":0.22994329},{"_index":"cardiopathyindex","_type":"cardiopathy","_id":"51d972da948516489d16752c","_score":0.20929527},.......

--
You received this message because you are subscribed to the Google Groups "elasticsearch" group.
To unsubscribe from this group and stop receiving emails from it, send an email to elasticsearch+unsubscribe@googlegroups.com.
For more options, visit https://groups.google.com/groups/opt_out.

Topic		Replies	Views
How to create index for a attachment of pdf by using elasticsearch-river-couchdb(1.2.0) (don't have any hits) Elasticsearch	7	554	July 6, 2017
Attachments questions Elasticsearch	2	252	July 6, 2017
Indexing pdf documents Elasticsearch	2	5196	December 27, 2016
Elasticsearch mapper attachment issues Elasticsearch	16	2352	July 5, 2017
Elasticsearch mongodb river with GridFS attached to DBObject problem Elasticsearch	1	681	July 6, 2017

How to create index for a attachment of pdf by using elasticsearch-river-mongodb: 1.6.9 (don't have any hits,or missing fields)

Related topics