Code for indexing mail into Elasticsearch


(Rouchad Rouchad) #1

Hello, I want to build a filter that extracts some information from each email, such as the sender, the IP address, the text, the name of the company, etc. I have heard about the grok filter, but I have not managed to use it well. Since a filter can be written in any programming language, I chose Python, and I found code that extracts everything I want from each email. My problem is that I want to apply this code in Logstash so that I get the same results in the Kibana interface.
How can I do that?
Thank you.

Here is the code in Python:
import sys, os, re, StringIO
import email, mimetypes

# Characters that must not appear in a file name: the punctuation that
# Windows rejects (plus '%' and the single quote) followed by the 32 ASCII
# control characters (code points 0-31).
invalid_chars_in_filename = '<>:"/\\|?*%\'' + ''.join(chr(code) for code in range(32))

# Device names that Windows reserves and that cannot be used as a file name.
invalid_windows_name = 'CON PRN AUX NUL COM1 COM2 COM3 COM4 COM5 COM6 COM7 COM8 COM9 LPT1 LPT2 LPT3 LPT4 LPT5 LPT6 LPT7 LPT8 LPT9'.split()

# Email address regex matching the RFC 2822 spec, from perlfaq9:
#
#   my $atom       = qr{[a-zA-Z0-9_!#\$\%&'*+/=?\^`{}~|\-]+};
#   my $dot_atom   = qr{$atom(?:\.$atom)*};
#   my $quoted     = qr{"(?:\\[^\r\n]|[^\\"])*"};
#   my $local      = qr{(?:$dot_atom|$quoted)};
#   my $domain_lit = qr{\[(?:\\\S|[\x21-\x5a\x5e-\x7e])*\]};
#   my $domain     = qr{(?:$dot_atom|$domain_lit)};
#   my $addr_spec  = qr{$local\@$domain};
#
# Python's translation. Every fragment is a raw string so that the
# backslashes reach the regex engine untouched.

# One or more "atom" characters, exactly as in the Perl pattern above.
atom_rfc2822 = r"[a-zA-Z0-9_!#\$\%&'*+/=?\^`{}~|\-]+"
# Stricter alternative kept for callers that want it.
atom_posfix_restricted = r"[a-zA-Z0-9_#\$&'*+/=?\^`{}~|\-]+"  # without '!' and '%'
atom = atom_rfc2822
# Atoms separated by single literal dots (no leading, trailing or double dot).
dot_atom = atom + r"(?:\." + atom + ")*"
# Double-quoted string: backslash-escaped pairs or anything but '\' and '"'.
quoted = r'"(?:\\[^\r\n]|[^\\"])*"'
local = "(?:" + dot_atom + "|" + quoted + ")"
# Bracketed domain literal such as [192.168.0.1].
domain_lit = r"\[(?:\\\S|[\x21-\x5a\x5e-\x7e])*\]"
domain = "(?:" + dot_atom + "|" + domain_lit + ")"
addr_spec = local + "@" + domain

# Anchored on both ends: the whole string must be exactly one address.
email_address_re = re.compile('^' + addr_spec + '$')

class Attachment:
    """Plain record describing one MIME part extracted from an email.

    Every constructor argument is stored unchanged on the instance under the
    same name, except ``content_id``: when it is wrapped in angle brackets
    (``<...>``) the brackets are stripped.
    """

    def __init__(self, part, filename=None, type=None, payload=None, charset=None, content_id=None, description=None, disposition=None, sanitized_filename=None, is_body=None):
        self.part = part  # original python part
        self.filename = filename  # filename in unicode (if any)
        self.type = type  # the mime-type
        self.payload = payload  # the MIME decoded content
        self.charset = charset  # the charset (if any)
        self.description = description  # if any
        self.disposition = disposition  # 'inline', 'attachment' or None
        self.sanitized_filename = sanitized_filename  # cleanup your filename here (TODO)
        self.is_body = is_body  # usually in (None, 'text/plain' or 'text/html')
        self.content_id = content_id  # if any
        if self.content_id:
            # strip '<>' to ease search and replace in "root" content (TODO)
            if self.content_id.startswith('<') and self.content_id.endswith('>'):
                self.content_id = self.content_id[1:-1]

def getmailheader(header_text, default="ascii"):
    """Decode header_text if needed.

    ``header_text`` is split into its encoded-word chunks; each chunk is
    decoded with its declared charset, or with ``default`` when no charset
    is declared or the declared one is unknown. Undecodable bytes are
    replaced rather than raising. Returns the chunks joined into one text
    string.

    If the header cannot be parsed at all, a sanitized ASCII-only version of
    ``header_text`` is returned instead.
    """
    # Imported by their lowercase module names (available on Python 2.5+ and
    # Python 3) and locally, so the function does not rely on the old
    # ``email.Header`` / ``email.Errors`` aliases.
    from email.errors import HeaderParseError
    from email.header import decode_header

    try:
        chunks = decode_header(header_text)
    except HeaderParseError:
        # This already happened in email.base64mime.decode();
        # instead return a sanitized ascii string.
        # This fails on '=?UTF-8?B?15HXmdeh15jXqNeVINeY15DXpteUINeTJ9eV16jXlSDXkdeg15XXldeUINem15PXpywg15TXptei16bXldei15nXnSDXqdecINek15zXmdeZ?==?UTF-8?B?157XldeR15nXnCwg157Xldek16Ig157Xl9eV15wg15HXodeV15bXnyDXk9ec15DXnCDXldeh15gg157Xl9eR16rXldeqINep15wg15HXmdeQ?==?UTF-8?B?15zXmNeZ?='
        return header_text.encode('ascii', 'replace').decode('ascii')

    decoded = []
    for text, charset in chunks:
        if not isinstance(text, bytes):
            # Already text (Python 3 returns str for a header without
            # encoded words): nothing to decode.
            decoded.append(text)
            continue
        try:
            decoded.append(text.decode(charset or default, 'replace'))
        except LookupError:
            # if the charset is unknown, force default
            decoded.append(text.decode(default, 'replace'))
    return u"".join(decoded)

def getmailaddresses(msg, name):
    """Retrieve addresses from a header; 'name' is supposed to be from, to, ...

    Returns a list of ``(display_name, address)`` tuples, one per address
    found in all ``name`` headers of ``msg``. The display name is passed
    through ``getmailheader``. The address is replaced by ``''`` when it is
    not pure ASCII or does not match ``email_address_re``.
    """
    # Local import so the function does not depend on ``import email`` having
    # loaded the ``email.utils`` submodule as a side effect.
    from email.utils import getaddresses

    addrs = getaddresses(msg.get_all(name, []))
    # The loop uses its own variable for the display name so the ``name``
    # parameter (the header name) is not overwritten.
    for i, (display_name, addr) in enumerate(addrs):
        if not display_name and addr:
            # only one string! Is it the address or is it the name?
            # use the same for both and see later
            display_name = addr

        try:
            # address must be ascii only; the encoded result is discarded so
            # that ``addr`` stays a text string the regex below can match
            addr.encode('ascii')
        except UnicodeError:
            addr = ''
        else:
            # address must match address regex
            if not email_address_re.match(addr):
                addr = ''
        addrs[i] = (getmailheader(display_name), addr)
    return addrs


(system) #2

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.