Elasticsearch Pipeline - Hash Message with sha1

Dear all,

I would like to share with you a way I found to generate the document _id during the ingest phase: a custom Painless script that computes the SHA-1 hash of the message field.

The Pipeline test:

POST _ingest/pipeline/_simulate
{
  "pipeline": {
    "processors": [
      {
        "script": {
          "source": "def hex(int num) {def hex_chr = \"0123456789abcdef\".toCharArray();  String str = \"\";  for(int j = 7; j >= 0; j--) str += hex_chr[((num >> (j * 4)) & 15)];  return str;} def str2blks_SHA1(String str){ int nblk = ((str.length() + 8) >> 6) + 1; int[] blks = new int[nblk * 16]; for(int a = 0; a < nblk * 16; a++) blks[a] = 0;int i = 0;for(; i < str.length(); i++) blks[i >> 2] |= str.codePointAt(i) << (24 - (i % 4) * 8); blks[i >> 2] |= 128 << (24 - (i % 4) * 8); blks[nblk * 16 - 1] = str.length() * 8; return blks;} def add(def x, def y){ def lsw = (x & 65535) + (y & 65535); def msw = (x >> 16) + (y >> 16) + (lsw >> 16); return (msw << 16) | (lsw & 65535);} def rol(def num, def cnt){ return (num << cnt) | (num >>> (32 - cnt));} def ft(def t, def b, def c, def d){if(t < 20) return (b & c) | ((~b) & d);if(t < 40) return b ^ c ^ d;if(t < 60) return (b & c) | (b & d) | (c & d);return b ^ c ^ d;} def kt(def t){ return (t < 20) ?  1518500249 : (t < 40) ?  1859775393 : (t < 60) ? -1894007588 : -899497514; } def calcSHA1(def str){ def x = str2blks_SHA1(str); def w = new def[80]; def a = 1732584193; def b = -271733879; def c = -1732584194; def d =  271733878; def e = -1009589776; for(def i = 0; i < x.length; i = i + 16){def olda = a; def oldb = b; def oldc = c; def oldd = d; def olde = e; for(def j = 0; j < 80; j++){if(j < 16) {w[j] = x[i + j];} else {w[j] = rol(w[j-3] ^ w[j-8] ^ w[j-14] ^ w[j-16], 1);} def t = add(add(rol(a, 5), ft(j, b, c, d)), add(add(e, w[j]), kt(j))); e = d; d = c; c = rol(b, 30); b = a; a = t; } a = add(a, olda); b = add(b, oldb); c = add(c, oldc); d = add(d, oldd); e = add(e, olde); } return hex(a) + hex(b) + hex(c) + hex(d) + hex(e);}ctx._id=calcSHA1(ctx.message);"
        }
      }
    ]
  },
  "docs": [
    {
      "_source": {
        "message": "My message to hash"
      }
    }
  ]
}

Result:

{
  "docs": [
    {
      "doc": {
        "_index": "_index",
        "_type": "_type",
        "_id": "5c8c11cf9fc8887e334d0998e6431b5d1bb925c6",
        "_source": {
          "message": "My message to hash"
        },
        "_ingest": {
          "timestamp": "2017-12-12T15:49:15.174Z"
        }
      }
    }
  ]
}

IMPORTANT: Do not forget to enable Painless scripting on your cluster.

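Note: the Painless script is a conversion of this JavaScript source code: http://lig-membres.imag.fr/donsez/cours/exemplescourstechnoweb/js_securehash/sha1src.html. Like the original, it packs each character's code point into a single byte slot, so the result matches a standard SHA-1 digest only when the message contains characters in the range U+0000 to U+00FF (for example, plain ASCII).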

Enjoy.

1 Like

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.