Hello All
I am trying read, parse and index a html file using the below python script.
from elasticsearch import Elasticsearch
from bs4 import BeautifulSoup
import glob
es=Elasticsearch([{'host':'ip-address','port':9200}])
def remove_tags(html):
# parse html content
soup = BeautifulSoup(html, "html.parser")
for data in soup(['style', 'script']):
# Remove tags
data.decompose()
# return data by retrieving the tag content
return ' '.join(soup.stripped_strings)
path = 'path_of_html_file'
files=glob.glob(path)
for file in files:
fname = open(file, 'r')
e1 = remove_tags(fname)
res = es.index(index='ep1',doc_type='employee',id=1,body=e1)
While executing the above script on my linux ec2, i am getting below error.
/usr/local/lib/python2.7/dist-packages/elasticsearch/connection/base.py:208: ElasticsearchWarning: the default number of shards will change from [5] to [1] in 7.0.0; if you wish to continue using the default of [5] shards, you must manage this on the create index request or with an index template
warnings.warn(message, category=ElasticsearchWarning)
Traceback (most recent call last):
File "readMount_Parse_Index.py", line 25, in <module>
res = es.index(index='ep1',doc_type='emp',id=1,body=e1)
File "/usr/local/lib/python2.7/dist-packages/elasticsearch/client/utils.py", line 168, in _wrapped
return func(*args, params=params, headers=headers, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/elasticsearch/client/__init__.py", line 411, in index
body=body,
File "/usr/local/lib/python2.7/dist-packages/elasticsearch/transport.py", line 415, in perform_request
raise e
elasticsearch.exceptions.RequestError: RequestError(400, u'mapper_parsing_exception', u'not_x_content_exception: Compressor detection can only be called on some xcontent bytes or compressed xcontent bytes')
Can somebody help me out on this if faced the same issue before.
Thanks!