Dear all,
I would like to share with you the way I found to generate the document _id during the ingest phase, using a custom Painless script that computes the SHA-1 hash of the message field.
The Pipeline test:
POST _ingest/pipeline/_simulate
{
"pipeline":{
"processors": [
{
"script":{
"source": "def hex(int num) {def hex_chr = \"0123456789abcdef\".toCharArray(); String str = \"\"; for(int j = 7; j >= 0; j--) str += hex_chr[((num >> (j * 4)) & 15)]; return str;} def str2blks_SHA1(String str){ int nblk = ((str.length() + 8) >> 6) + 1; int[] blks = new int[nblk * 16]; for(int a = 0; a < nblk * 16; a++) blks[a] = 0;int i = 0;for(; i < str.length(); i++) blks[i >> 2] |= str.codePointAt(i) << (24 - (i % 4) * 8); blks[i >> 2] |= 128 << (24 - (i % 4) * 8); blks[nblk * 16 - 1] = str.length() * 8; return blks;} def add(def x, def y){ def lsw = (x & 65535) + (y & 65535); def msw = (x >> 16) + (y >> 16) + (lsw >> 16); return (msw << 16) | (lsw & 65535);} def rol(def num, def cnt){ return (num << cnt) | (num >>> (32 - cnt));} def ft(def t, def b, def c, def d){if(t < 20) return (b & c) | ((~b) & d);if(t < 40) return b ^ c ^ d;if(t < 60) return (b & c) | (b & d) | (c & d);return b ^ c ^ d;} def kt(def t){ return (t < 20) ? 1518500249 : (t < 40) ? 1859775393 : (t < 60) ? -1894007588 : -899497514; } def calcSHA1(def str){ def x = str2blks_SHA1(str); def w = new def[80]; def a = 1732584193; def b = -271733879; def c = -1732584194; def d = 271733878; def e = -1009589776; for(def i = 0; i < x.length; i = i + 16){def olda = a; def oldb = b; def oldc = c; def oldd = d; def olde = e; for(def j = 0; j < 80; j++){if(j < 16) {w[j] = x[i + j];} else {w[j] = rol(w[j-3] ^ w[j-8] ^ w[j-14] ^ w[j-16], 1);} def t = add(add(rol(a, 5), ft(j, b, c, d)), add(add(e, w[j]), kt(j))); e = d; d = c; c = rol(b, 30); b = a; a = t; } a = add(a, olda); b = add(b, oldb); c = add(c, oldc); d = add(d, oldd); e = add(e, olde); } return hex(a) + hex(b) + hex(c) + hex(d) + hex(e);}ctx._id=calcSHA1(ctx.message);"
}
}
]
},
"docs" : [
{
"_source":{
"message": "My message to hash"
}
}
]
}
Result:
{
"docs": [
{
"doc": {
"_index": "_index",
"_type": "_type",
"_id": "5c8c11cf9fc8887e334d0998e6431b5d1bb925c6",
"_source": {
"message": "My message to hash"
},
"_ingest": {
"timestamp": "2017-12-12T15:49:15.174Z"
}
}
}
]
}
IMPORTANT: Do not forget to enable Painless scripting on your cluster.
Note: the Painless script is a conversion of this JavaScript source code: http://lig-membres.imag.fr/donsez/cours/exemplescourstechnoweb/js_securehash/sha1src.html
Enjoy.