How to create an index for a PDF attachment using elasticsearch-river-mongodb 1.6.9 (no hits, or missing fields)

Dear All,
I am new to Elasticsearch. For days I have been trying to follow various
tutorials and posts on indexing and mapping attached PDF documents stored in a
MongoDB database, without success. After running the code below, I don't get
any hits for words that exist in the files attached in MongoDB.

software version:
MongoDB: mongodb-linux-x86_64-2.4.3
elasticsearch-river-mongodb: 1.6.9
elasticsearch: 0.90
elasticsearch-mapper-attachments: 1.7.0

Problem No. 1


  1. BSON structure: the PDF attachment is stored in the "FileContent" field;
    the attachment is not in GridFS.
    byte [] fileser = iou.read(file);
    Pagecount = getpagenum(file);
    BasicDBObject articleobject = new BasicDBObject();
    articleobject.put("Title", jsonArray.getJSONObject(i).get("Title"));
    articleobject.put("Authors",jsonArray.getJSONObject(i).get("Authors"));
    articleobject.put("Organization",
    jsonArray.getJSONObject(i).get("Organization"));
    articleobject.put("Media", jsonArray.getJSONObject(i).get("Media"));
    articleobject.put("ISSN", jsonArray.getJSONObject(i).get("ISSN"));
    articleobject.put("Pages", jsonArray.getJSONObject(i).get("Pages"));
    articleobject.put("Pagecount", Pagecount);
    articleobject.put("Abstracts", jsonArray.getJSONObject(i).get("Abstracts"));
    articleobject.put("Keywords", "");
    articleobject.put("FileContent", fileser);
    collection.insert(articleobject);

  1. create an index
    curl -XPUT "http://localhost:9200/articleindex"

  1. create a mapping
    curl -XPUT 'http://localhost:9200/articleindex/cardiopathy/_mapping' -d '
    {
    "cardiopathy" : {
    "properties" : {
    "Authors" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer":
    "ik","store" : "yes"},
    "Media" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer":
    "ik","store" : "yes"},
    "Organization" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer":
    "ik","store" : "yes"},
    "Keywords" : { "type" : "string" ,"indexAnalyzer": "ik","searchAnalyzer":
    "ik","store" : "yes"},
    "Title" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer":
    "ik","store" : "yes"},
    "ISSN" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer":
    "ik","store" : "yes"},
    "Pages" : { "type" : "string" ,"indexAnalyzer": "ik","searchAnalyzer":
    "ik","store" : "yes"},
    "Abstracts" : { "type" : "string" ,"indexAnalyzer": "ik","searchAnalyzer":
    "ik","store" : "yes"},
    "FileContent" : { "type" : "string" ,"indexAnalyzer":
    "ik","searchAnalyzer": "ik"}
    }
    }
    }'

  1. create the river
    curl -XPUT "http://localhost:9200/_river/mongodb/_meta" -d '
    {
    "type": "mongodb",
    "mongodb": {
    "host": "192.168.1.112",
    "port": "27107",
    "options": {"drop_collection": true },
    "db": "ftsearch1",
    "collection": "pdf"
    },
    "index": {
    "name": "articleindex",
    "type": "cardiopathy"
    }
    }'

  1. Retrieve the indexed document by the keyword
    curl -XGET http://localhost:9200/articleindex/cardiopathy/_search -d'
    {
    "fields" : ["Title"],
    "query" : { "text" : { "FileContent" : "高血压病辨证分型与靶器官相关性研究的新进展" }}
    }
    '

{"took":179,"timed_out":false,"_shards":{"total":5,"successful":5,"failed":0},"hits":{"total":0,"max_score":null,"hits":[]}}

Problem No. 2


alter mapping:

curl -XPUT 'http://localhost:9200/articleindex/cardiopathy/_mapping' -d '
{
"cardiopathy" : {
"file" : {
"properties" : {
"Authors" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer":
"ik","store" : "yes"},
"Media" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer":
"ik","store" : "yes"},
"Organization" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer":
"ik","store" : "yes"},
"Keywords" : { "type" : "string" ,"indexAnalyzer": "ik","searchAnalyzer":
"ik","store" : "yes"},
"Title" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer":
"ik","store" : "yes"},
"ISSN" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer":
"ik","store" : "yes"},
"Pages" : { "type" : "string" ,"indexAnalyzer": "ik","searchAnalyzer":
"ik","store" : "yes"},
"Abstracts" : { "type" : "string" ,"indexAnalyzer": "ik","searchAnalyzer":
"ik","store" : "yes"},
"FileContent" : {
"type" : "attachment",
"fields" : {
"file" : { "indexAnalyzer": "ik","searchAnalyzer": "ik","store" : "yes",
"index" : "analyzed" },
"date" : { "store" : "yes" },
"author" : { "store" : "yes" },
"keywords" : { "store" : "yes" },
"content_type" : { "store" : "yes" },
"title" : { "store" : "yes" }
}
}
}
}
}
}'

Retrieve the indexed document by the keyword:

{"took":63,"timed_out":false,"_shards":{"total":5,"successful":5,"failed":0},"hits":{"total":0,"max_score":null,"hits":[]}}

Problem No. 3


  1. The attachment is stored in GridFS; in addition, we set the other fields on the GridFS file.
    GridFSInputFile gfsFile = gfsPhoto.createFile(file);
    String filename = file.getName();
    filename = filename.substring(0, filename.lastIndexOf("."));
    gfsFile.setFilename(filename);
    gfsFile.put("Title", jsonArray.getJSONObject(i).get("Title"));
    gfsFile.put("Authors",jsonArray.getJSONObject(i).get("Authors"));
    gfsFile.put("Organization", jsonArray.getJSONObject(i).get("Organization"));
    gfsFile.put("Media", jsonArray.getJSONObject(i).get("Media"));
    gfsFile.put("ISSN", jsonArray.getJSONObject(i).get("ISSN"));
    gfsFile.put("Pages", jsonArray.getJSONObject(i).get("Pages"));
    gfsFile.put("Pagecount", Pagecount);
    gfsFile.put("Abstracts", jsonArray.getJSONObject(i).get("Abstracts"));
    gfsFile.put("Keywords", "");
    gfsFile.save();

  1. create an index
    curl -XPUT "http://localhost:9200/articleindex"

  1. create a mapping
    curl -XPUT 'http://localhost:9200/cardiopathyindex/cardiopathy/_mapping' -d
    '{
    "cardiopathy": {
    "properties" : {
    "content" : {
    "path" : "full",
    "type" : "attachment",
    "fields" : {
    "content" : {"type": "string","indexAnalyzer":
    "ik","searchAnalyzer": "ik"},
    "Authors" : {"type": "string","indexAnalyzer":
    "ik","searchAnalyzer": "ik"},
    "Media" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer": "ik"},
    "Organization" : {"type": "string","indexAnalyzer":
    "ik","searchAnalyzer": "ik"},
    "Keywords" : { "type" : "string" ,"indexAnalyzer":
    "ik","searchAnalyzer": "ik"},
    "Title" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer": "ik"},
    "ISSN" : {"type": "string","indexAnalyzer":
    "ik","searchAnalyzer": "ik"},
    "Pages" : { "type" : "string" ,"indexAnalyzer":
    "ik","searchAnalyzer": "ik"},
    "Abstracts" : { "type" : "string" ,"indexAnalyzer":
    "ik","searchAnalyzer": "ik"},
    "date" : {"format" : "dateOptionalTime","type" : "date" },
    "content_type" : { "type" : "string" }
    }
    },
    "chunkSize" : { "type" : "long" },
    "md5" : { "type" : "string" },
    "length" : { "type" : "long" },
    "filename" : { "type" : "string" },
    "contentType" : { "type" : "string" },
    "uploadDate" : {
    "format" : "dateOptionalTime",
    "type" : "date"
    },
    "metadata" : { "type" : "object" }
    }
    }
    }'

  1. create the river
    curl -XPUT "http://localhost:9200/_river/mongodb/_meta" -d '
    {
    "type": "mongodb",
    "mongodb": {
    "host": "192.168.1.112",
    "port": "27107",
    "options": {"drop_collection": true },
    "db": "ftsearch",
    "collection": "fs",
    "gridfs": true
    },
    "index": {
    "name": "cardiopathyindex",
    "type": "cardiopathy",
    "content_type": "application/pdf"
    }
    }'

  1. Retrieve the indexed document by keyword: the search returns hits, but
    the results are missing the "Title" and "Authors" fields.
    curl -XGET http://localhost:9200/cardiopathyindex/cardiopathy/_search -d'
    {
    "fields" : ["Title","Authors"],
    "query" : { "text" : { "content" : "高血压病辨证分型与靶器官相关性研究的新进展" }}
    }
    '
    {"took":1005,"timed_out":false,"_shards":{"total":5,"successful":5,"failed":0},"hits":{"total":96,"max_score":0.68918943,"hits":[{"_index":"cardiopathyindex","_type":"cardiopathy","_id":"51d972d5948516489d1674d1","_score":0.68918943},{"_index":"cardiopathyindex","_type":"cardiopathy","_id":"51d972db948516489d167545","_score":0.22994329},{"_index":"cardiopathyindex","_type":"cardiopathy","_id":"51d972da948516489d16752c","_score":0.20929527},.......

--
You received this message because you are subscribed to the Google Groups "elasticsearch" group.
To unsubscribe from this group and stop receiving emails from it, send an email to elasticsearch+unsubscribe@googlegroups.com.
For more options, visit https://groups.google.com/groups/opt_out.

I think you should create the mapping first (don't alter an existing mapping, as altering basically only works for new fields and not for existing ones):

curl -XPUT 'http://localhost:9200/articleindex/cardiopathy/_mapping' -d '
{
"cardiopathy" : {
"file" : {
"properties" : {
"Authors" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer": "ik","store" : "yes"},
"Media" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer": "ik","store" : "yes"},
"Organization" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer": "ik","store" : "yes"},
"Keywords" : { "type" : "string" ,"indexAnalyzer": "ik","searchAnalyzer": "ik","store" : "yes"},
"Title" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer": "ik","store" : "yes"},
"ISSN" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer": "ik","store" : "yes"},
"Pages" : { "type" : "string" ,"indexAnalyzer": "ik","searchAnalyzer": "ik","store" : "yes"},
"Abstracts" : { "type" : "string" ,"indexAnalyzer": "ik","searchAnalyzer": "ik","store" : "yes"},
"FileContent" : {
"type" : "attachment",
"fields" : {
"file" : { "indexAnalyzer": "ik","searchAnalyzer": "ik","store" : "yes", "index" : "analyzed" },
"date" : { "store" : "yes" },
"author" : { "store" : "yes" },
"keywords" : { "store" : "yes" },
"content_type" : { "store" : "yes" },
"title" : { "store" : "yes" }
}
}
}
}
}
}'

HTH

--
David Pilato | Technical Advocate | Elasticsearch.com
@dadoonet | @elasticsearchfr | @scrutmydocs

Le 8 juil. 2013 à 12:38, Jordon quwu.ustb@gmail.com a écrit :

Dear All,
I am new to Elasticsearch. For days I have been trying to follow various tutorials and posts on indexing and mapping attached PDF documents stored in a MongoDB database, without success. After running the code below, I don't get any hits for words that exist in the files attached in MongoDB.

software version:
MongoDB: mongodb-linux-x86_64-2.4.3
elasticsearch-river-mongodb: 1.6.9
elasticsearch: 0.90
elasticsearch-mapper-attachments: 1.7.0

Problem No. 1


  1. BSON structure: the PDF attachment is stored in the "FileContent" field; the attachment is not in GridFS.
    byte [] fileser = iou.read(file);
    Pagecount = getpagenum(file);
    BasicDBObject articleobject = new BasicDBObject();
    articleobject.put("Title", jsonArray.getJSONObject(i).get("Title"));
    articleobject.put("Authors",jsonArray.getJSONObject(i).get("Authors"));
    articleobject.put("Organization", jsonArray.getJSONObject(i).get("Organization"));
    articleobject.put("Media", jsonArray.getJSONObject(i).get("Media"));
    articleobject.put("ISSN", jsonArray.getJSONObject(i).get("ISSN"));
    articleobject.put("Pages", jsonArray.getJSONObject(i).get("Pages"));
    articleobject.put("Pagecount", Pagecount);
    articleobject.put("Abstracts", jsonArray.getJSONObject(i).get("Abstracts"));
    articleobject.put("Keywords", "");
    articleobject.put("FileContent", fileser);
    collection.insert(articleobject);

  1.    create an index
    

curl -XPUT "http://localhost:9200/articleindex"


  1.    create a mapping
    

curl -XPUT 'http://localhost:9200/articleindex/cardiopathy/_mapping' -d '
{
"cardiopathy" : {
"properties" : {
"Authors" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer": "ik","store" : "yes"},
"Media" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer": "ik","store" : "yes"},
"Organization" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer": "ik","store" : "yes"},
"Keywords" : { "type" : "string" ,"indexAnalyzer": "ik","searchAnalyzer": "ik","store" : "yes"},
"Title" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer": "ik","store" : "yes"},
"ISSN" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer": "ik","store" : "yes"},
"Pages" : { "type" : "string" ,"indexAnalyzer": "ik","searchAnalyzer": "ik","store" : "yes"},
"Abstracts" : { "type" : "string" ,"indexAnalyzer": "ik","searchAnalyzer": "ik","store" : "yes"},
"FileContent" : { "type" : "string" ,"indexAnalyzer": "ik","searchAnalyzer": "ik"}
}
}
}'


  1. create the river
    

curl -XPUT "http://localhost:9200/_river/mongodb/_meta" -d '
{
"type": "mongodb",
"mongodb": {
"host": "192.168.1.112",
"port": "27107",
"options": {"drop_collection": true },
"db": "ftsearch1",
"collection": "pdf"
},
"index": {
"name": "articleindex",
"type": "cardiopathy"
}
}'


  1. Retrieve the indexed document by the keyword
    curl -XGET http://localhost:9200/articleindex/cardiopathy/_search -d'
    {
    "fields" : ["Title"],
    "query" : { "text" : { "FileContent" : "高血压病辨证分型与靶器官相关性研究的新进展" }}
    }
    '

{"took":179,"timed_out":false,"_shards":{"total":5,"successful":5,"failed":0},"hits":{"total":0,"max_score":null,"hits":[]}}

Problem No. 2


alter mapping:

curl -XPUT 'http://localhost:9200/articleindex/cardiopathy/_mapping' -d '
{
"cardiopathy" : {
"file" : {
"properties" : {
"Authors" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer": "ik","store" : "yes"},
"Media" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer": "ik","store" : "yes"},
"Organization" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer": "ik","store" : "yes"},
"Keywords" : { "type" : "string" ,"indexAnalyzer": "ik","searchAnalyzer": "ik","store" : "yes"},
"Title" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer": "ik","store" : "yes"},
"ISSN" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer": "ik","store" : "yes"},
"Pages" : { "type" : "string" ,"indexAnalyzer": "ik","searchAnalyzer": "ik","store" : "yes"},
"Abstracts" : { "type" : "string" ,"indexAnalyzer": "ik","searchAnalyzer": "ik","store" : "yes"},
"FileContent" : {
"type" : "attachment",
"fields" : {
"file" : { "indexAnalyzer": "ik","searchAnalyzer": "ik","store" : "yes", "index" : "analyzed" },
"date" : { "store" : "yes" },
"author" : { "store" : "yes" },
"keywords" : { "store" : "yes" },
"content_type" : { "store" : "yes" },
"title" : { "store" : "yes" }
}
}
}
}
}
}'

Retrieve the indexed document by the keyword:

{"took":63,"timed_out":false,"_shards":{"total":5,"successful":5,"failed":0},"hits":{"total":0,"max_score":null,"hits":[]}}

Problem No. 3


  1.    The attachment is stored in GridFS; in addition, we set the other fields on the GridFS file.
    

GridFSInputFile gfsFile = gfsPhoto.createFile(file);
String filename = file.getName();
filename = filename.substring(0, filename.lastIndexOf("."));
gfsFile.setFilename(filename);
gfsFile.put("Title", jsonArray.getJSONObject(i).get("Title"));
gfsFile.put("Authors",jsonArray.getJSONObject(i).get("Authors"));
gfsFile.put("Organization", jsonArray.getJSONObject(i).get("Organization"));
gfsFile.put("Media", jsonArray.getJSONObject(i).get("Media"));
gfsFile.put("ISSN", jsonArray.getJSONObject(i).get("ISSN"));
gfsFile.put("Pages", jsonArray.getJSONObject(i).get("Pages"));
gfsFile.put("Pagecount", Pagecount);
gfsFile.put("Abstracts", jsonArray.getJSONObject(i).get("Abstracts"));
gfsFile.put("Keywords", "");
gfsFile.save();


  1.    create an index
    

curl -XPUT "http://localhost:9200/articleindex"


  1.    create a mapping
    

curl -XPUT 'http://localhost:9200/cardiopathyindex/cardiopathy/_mapping' -d '{
"cardiopathy": {
"properties" : {
"content" : {
"path" : "full",
"type" : "attachment",
"fields" : {
"content" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer": "ik"},
"Authors" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer": "ik"},
"Media" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer": "ik"},
"Organization" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer": "ik"},
"Keywords" : { "type" : "string" ,"indexAnalyzer": "ik","searchAnalyzer": "ik"},
"Title" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer": "ik"},
"ISSN" : {"type": "string","indexAnalyzer": "ik","searchAnalyzer": "ik"},
"Pages" : { "type" : "string" ,"indexAnalyzer": "ik","searchAnalyzer": "ik"},
"Abstracts" : { "type" : "string" ,"indexAnalyzer": "ik","searchAnalyzer": "ik"},
"date" : {"format" : "dateOptionalTime","type" : "date" },
"content_type" : { "type" : "string" }
}
},
"chunkSize" : { "type" : "long" },
"md5" : { "type" : "string" },
"length" : { "type" : "long" },
"filename" : { "type" : "string" },
"contentType" : { "type" : "string" },
"uploadDate" : {
"format" : "dateOptionalTime",
"type" : "date"
},
"metadata" : { "type" : "object" }
}
}
}'


  1. create the river
    

curl -XPUT "http://localhost:9200/_river/mongodb/_meta" -d '
{
"type": "mongodb",
"mongodb": {
"host": "192.168.1.112",
"port": "27107",
"options": {"drop_collection": true },
"db": "ftsearch",
"collection": "fs",
"gridfs": true
},
"index": {
"name": "cardiopathyindex",
"type": "cardiopathy",
"content_type": "application/pdf"
}
}'


  1. Retrieve the indexed document by keyword: the search returns hits, but the results are missing the "Title" and "Authors" fields.
    curl -XGET http://localhost:9200/cardiopathyindex/cardiopathy/_search -d'
    {
    "fields" : ["Title","Authors"],
    "query" : { "text" : { "content" : "高血压病辨证分型与靶器官相关性研究的新进展" }}
    }
    '
    {"took":1005,"timed_out":false,"_shards":{"total":5,"successful":5,"failed":0},"hits":{"total":96,"max_score":0.68918943,"hits":[{"_index":"cardiopathyindex","_type":"cardiopathy","_id":"51d972d5948516489d1674d1","_score":0.68918943},{"_index":"cardiopathyindex","_type":"cardiopathy","_id":"51d972db948516489d167545","_score":0.22994329},{"_index":"cardiopathyindex","_type":"cardiopathy","_id":"51d972da948516489d16752c","_score":0.20929527},.......

--
You received this message because you are subscribed to the Google Groups "elasticsearch" group.
To unsubscribe from this group and stop receiving emails from it, send an email to elasticsearch+unsubscribe@googlegroups.com.
For more options, visit https://groups.google.com/groups/opt_out.