Enhancing perf for my cluster

pbocto · August 18, 2014, 10:29am

Hi everyone !

I'm currently working on a tool with ES and Twitter Streaming API, in
which I try to find interesting profiles on Twitter, based on what they
tweet, RT and which of their interactions are shared/RT.

Anyway, I use ES to index and search among tweets. To do that, I take the
Twitter stream data and put users & tweets (2 types) in a single index,
linked by the user id via a parent-child relation. I have thought about my
indexing a lot, and I believe this is the best way to do it.

I need to update users very often (because I score them and because they
update their profiles quite often), so nesting the user inside each tweet is
not an option (too many duplicated copies of the user to keep up to date).
I could put a user's tweets directly in the user object, but I would end up
with huge objects and I don't really want that.

I work on a SoYouStart server: 4c/4t 3.2GHz, 32 GB RAM, 4 TB HDD.

My settings for the index are :

settings = {

"index" : {

    "number_of_replicas" : 0,

    "refresh_interval" : '10s',

    "routing.allocation.disable_allocation": False

},

"analysis": {

    "analyzer": {

        "snowFrench":{

            "type": "snowball",

            "language": "French"

        },

        "snowEnglish":{

            "type": "snowball",

            "language": "English"

        },

        "snowGerman":{

            "type": "snowball",

            "language": "German"

        },

        "snowRussian":{

            "type": "snowball",

            "language": "Russian"

        },

        "snowSpanish":{

            "type": "snowball",

            "language": "Spanish"

        },

        "snowJapanese":{

            "type": "snowball",

            "language": "Japanese"

        },

        "edgeNGramAnalyzer":{

        "tokenizer": "myEdgeNGram"

        },

        "name_analyzer": {

"tokenizer": "whitespace",

"type": "custom",

"filter": ["lowercase", "multi_words", "name_filter"]

},

        "city_analyzer" : {

            "type" : "snowball",

            "language" : "English"

        }

    },

    "tokenizer" : {

        "myEdgeNGram" : {

            "type" : "edgeNGram",

            "min_gram" : 2,

            "max_gram" : 5

        },

        "name_tokenizer": {

"type": "edgeNGram",

"max_gram": 100,

"min_gram": 4

}

    },

    "filter": {

"multi_words": {

"type": "shingle",

"min_shingle_size": 2,

"max_shingle_size": 10

},

"name_filter": {

"type": "edgeNGram",

"max_gram": 100,

"min_gram": 4

}

And my mappings are :

tweet_mapping = {

"_all" : {
"enabled" : False
},
"_ttl" : {
"enabled" : True,
"default" : "400d"
},
"_parent" : {
"type" : 'user'
},
"properties": {
"textfr": {
'type': 'string',
'_analyzer': 'snowFrench',
'copy_to': 'text'
},
"texten": {
'type': 'string',
'_analyzer': 'snowEnglish',
'copy_to': 'text'
},
"textde": {
'type': 'string',
'_analyzer': 'snowGerman',
'copy_to': 'text'
},
"textja": {
'type': 'string',
'_analyzer': 'snowJapanese',
'copy_to': 'text'
},
"textru": {
'type': 'string',
'_analyzer': 'snowRussian',
'copy_to': 'text'
},
"textes": {
'type': 'string',
'_analyzer': 'snowSpanish',
'copy_to': 'text'
},
"text": {
'type': 'string',
'null_value': '',
'index': 'analyzed',
'store': 'yes'
},
"entities": {
'type': 'object',
'index': 'analyzed',
'store': 'yes',
'properties': {
"hashtags": {
'index': 'analyzed',
'store': 'yes',
'type': 'string',
"_analyzer": "edgeNGramAnalyzer"
},
"mentions": {
'index': 'not_analyzed',
'store': 'yes',
'type': 'long',
'precision_step': 64
}
}
},
"lang": {
'index': 'not_analyzed',
'store': 'yes',
'type': 'string'
},
"created_at": {
'index': 'not_analyzed',
'store': 'yes',
'type': 'date',
'format' : 'dd-MM-YYYY HH:mm:ss'
}
}
}
user_mapping = {
"_all" : {
"enabled" : False
},
"_ttl" : {
"enabled" : True,
"default" : "600d"
},
"properties": {
"lang": {
'index': 'not_analyzed',
'store': 'yes',
'type': 'string'
},
"name": {
'index': 'analyzed',
'store': 'yes',
'type': 'string',
"_analyzer": "edgeNGramAnalyzer"
},
"screen_name": {
'index': 'analyzed',
'store': 'yes',
'type': 'string',
"_analyzer": "edgeNGramAnalyzer"
},
"descfr": {
'type': 'string',
'_analyzer': 'snowFrench',
'copy_to': 'description'
},
"descen": {
'type': 'string',
'_analyzer': 'snowEnglish',
'copy_to': 'description'
},
"descde": {
'type': 'string',
'_analyzer': 'snowGerman',
'copy_to': 'description'
},
"descja": {
'type': 'string',
'_analyzer': 'snowJapanese',
'copy_to': 'description'
},
"descru": {
'type': 'string',
'_analyzer': 'snowRussian',
'copy_to': 'description'
},
"desces": {
'type': 'string',
'_analyzer': 'snowSpanish',
'copy_to': 'description'
},
"description": {
'type': 'string',
'null_value': '',
'index': 'analyzed',
'store': 'yes'
},
"created_at": {
'index': 'not_analyzed',
'store': 'yes',
'type': 'date',
'format' : 'dd-MM-YYYY HH:mm:ss'
},
"profile_image_url": {
'index': 'not_analyzed',
'store': 'yes',
'type': 'string'
},
"analysis": {
'type': 'object',
'index': 'analyzed',
'store': 'yes',
'properties': {
"hashtags": {
'index': 'analyzed',
'store': 'yes',
'type': 'object'
},
"relations": {
'index': 'analyzed',
'store': 'yes',
'type': 'object'
},
"score": {
'index': 'analyzed',
'store': 'yes',
'type': 'object'
}
}
},
"location" : {
'type': 'object',
'index': 'analyzed',
'store': 'yes',
"properties" : {
"search_field": {
'index': 'analyzed',
'store': 'yes',
'type': 'string',
'analyzer': 'city_analyzer',
'null_value': ''
},
"name": {
'index': 'not_analyzed',
'store': 'yes',
'type': 'string',
'null_value': ''
},
"city": {
'index': 'analyzed',
'store': 'yes',
'type': 'object',
'properties': {
'name': {
'boost': 3.0,
'index': 'analyzed',
'store': 'yes',
'type': 'string',
'copy_to': 'location.search_field'
},
'full_name': {
'boost': 3.0,
'index': 'analyzed',
'store': 'yes',
'type': 'string',
'copy_to': ['location.search_field', 'location.name']
},
'alternate_names': {
'boost': 2.0,
'index': 'analyzed',
'store': 'yes',
'type': 'string',
'copy_to': 'location.search_field'
}
}
},
"admin2": {
'index': 'analyzed',
'store': 'yes',
'type': 'object',
'properties': {
'name': {
'boost': 1.5,
'index': 'analyzed',
'store': 'yes',
'type': 'string',
'copy_to': 'location.search_field'
},
'full_name': {
'boost': 1.5,
'index': 'analyzed',
'store': 'yes',
'type': 'string',
'copy_to': ['location.search_field', 'location.name']
}
}
},
"admin1": {
'index': 'analyzed',
'store': 'yes',
'type': 'object',
'properties': {
'name': {
'boost': 1.2,
'index': 'analyzed',
'store': 'yes',
'type': 'string',
'copy_to': 'location.search_field'
},
'full_name': {
'boost': 1.2,
'index': 'analyzed',
'store': 'yes',
'type': 'string',
'copy_to': ['location.search_field', 'location.name']
}
}
},
"country": {
'index': 'analyzed',
'store': 'yes',
'type': 'object',
'properties': {
'name': {
'index': 'analyzed',
'store': 'yes',
'type': 'string',
'copy_to': ['location.search_field', 'location.name']
},
'fips': {
'index': 'analyzed',
'store': 'yes',
'type': 'string',
'copy_to': 'location.search_field'
},
'capital': {
'index': 'analyzed',
'store': 'yes',
'type': 'string'
}
}
},
"location": {
'index': 'analyzed',
'store': 'yes',
'type': 'geo_point'
},
"population": {
'index': 'analyzed',
'store': 'yes',
'type': 'long'
},
'capital': {
'index': 'not_analyzed',
'store': 'yes',
'type': 'boolean'
}
}
}
}
}

Currently my cluster contains 60M docs (40M tweets, 20M users). I have only
one node, no replicas, because if I create another node, no data will go in
there...

When I index users, I geolocate them from the string they put in their
profile (geolocation in tweets is present only 5% of the time, so it's not
very useful). To do that, I indexed (in another index) the biggest cities in
the world, and I assign a city to each user.

Something else you should know: I use Python and the PyES library.

SO. Let's talk about the problem :

My goal is to sort users by the relevance of their tweets. To do that, I
analyze each user's profile, timeline and the tweets in which they are
mentioned (RTs, messages).
So what happens in my script?

I have a REST API based on Django REST Framework and a frontend with
AngularJS

1/ I type a keyword (e.g. java, python, nodejs) and a location (not
required, e.g. paris)
2/ I use the count API to find every user that talks about "java" in "paris"
3/ Then I get the first 20 results of this query.
4/ I do a multi_search query to get the users' timelines and mentions
5/ I score them
6/ Once they're scored, AngularJS displays the sorted results and sends
another request to the API to score page 2, and so on until there are no more
pages available.

The queries I do are :
1/ To get users :

{

'query': {

    'bool': {

        'should': [

            {

                'multi_match': {

                    'use_dis_max': True,

                    'query': 'java',

                    'type': 'boolean',

                    'operator': 'or',

                    'fields': [

                        'name',

                        'screen_name',

                        'description'

                    ]

                }

            },

            {

                'has_child': {

                    'query': {

                        'match': {

                            'text': {

                                'operator': 'or',

                                'query': 'java',

                                'type': 'boolean'

                            }

                        }

                    },

                    'type': 'tweet'

                }

            }

        ],

        'minimum_number_should_match': 1,

        'must': [

            {

                'function_score': {

                    'query': {

                        'match': {

                            'location.search_field': {

                                'operator': 'or',

                                'query': 'paris',

                                'type': 'boolean'

                            }

                        }

                    },

                    'functions': [

                        {

                            'script_score': {

                                'script': "_score *

(doc['capital'].value == 'T' ? 2 : 1)"

                            }

                        },

                        {

                            'script_score': {

                                'script': "_score *

doc['search_field'].values.size()"

                            }

                        }

                    ]

                }

            }

        ]

    }

},

'from': 20,

'size': 20

}

2/ To get timelines and mentions:

{

'query': {
    'match': {
        'entities.mentions': {
            'operator': 'or',
            'query': 'userID',
            'type': 'boolean'
        }
    }
},
'_source': True

}

and

{

'query': {

    'has_parent': {

        'query': {

            'match': {

                'id': {

                    'operator': 'or',

                    'query': 'userID',

                    'type': 'boolean'

                }

            }

        },

        'type': 'user'

    }

},

'_source': True

}

BUT. Scoring one page can take from a few seconds to several minutes !!! I
don't think it's normal, right ? I profiled my script and this is it : ES
requests take toooooo long. Usually it's something like 10-20sec (and it's
still too long), but sometimes it can take up to 90sec...

I have studied ES quite well and I think I understand many things, but here I
don't know what I can do to change that... Any ideas? Thanks!

--
You received this message because you are subscribed to the Google Groups "elasticsearch" group.
To unsubscribe from this group and stop receiving emails from it, send an email to elasticsearch+unsubscribe@googlegroups.com.
To view this discussion on the web visit https://groups.google.com/d/msgid/elasticsearch/5342d476-eed6-40f9-9fa9-93dde23371b2%40googlegroups.com.
For more options, visit https://groups.google.com/d/optout.

pbocto · August 18, 2014, 4:41pm

Hey guys,

In the end I changed all my queries to constant_score queries. It's way better,
but certain pages still take a long time to run... I don't understand
why, and I don't have anything in my ES logs...

Now the average time to search for 20 users and their mentions/timelines and
score them is about 4s (of which almost 4s is the search itself).
But when it is slow, it still takes 60s for 1 page !!!

I tried reading the explain data I can get after the query, but it contains no
response time. How can I find out why certain queries take
so much time?

Thanks !

Le lundi 18 août 2014 12:29:10 UTC+2, Pierrick Boutruche a écrit :

Hi everyone !

I'm currently working on a tool with ES and Twitter Streaming API, in
which I try to find interesting profiles on Twitter, based on what they
tweet, RT and which of their interactions are shared/RT.

Anyway, I use ES to index and search among tweets. To do that, I get
Twitter stream data and put in a single index users & tweets (2 types),
linked by the user id via un parent-child relation. Actually, I thought of
my indexing a lot and it is the best way to do it.

I need to update very often users (because i score them and because they
update their profile quite often), so get the user nested in the tweet is
not an option (too many replicas)

I could put user's tweets directly in the user object but I would have
huge objects and I don't really want that.

I work on a SoYouStart Server, 4c/4t 3.2GHz, 32Go RAM, 4To HDD.

My settings for the index are :

settings = {

"index" : {
    "number_of_replicas" : 0,

    "refresh_interval" : '10s',

    "routing.allocation.disable_allocation": False

},
"analysis": {
    "analyzer": {

        "snowFrench":{

            "type": "snowball",

            "language": "French"

        },

        "snowEnglish":{

            "type": "snowball",

            "language": "English"

        },

        "snowGerman":{

            "type": "snowball",

            "language": "German"

        },

        "snowRussian":{

            "type": "snowball",

            "language": "Russian"

        },

        "snowSpanish":{

            "type": "snowball",

            "language": "Spanish"

        },

        "snowJapanese":{

            "type": "snowball",

            "language": "Japanese"

        },

        "edgeNGramAnalyzer":{

        "tokenizer": "myEdgeNGram"

        },

        "name_analyzer": {
"tokenizer": "whitespace",

"type": "custom",

"filter": ["lowercase", "multi_words", "name_filter"]

},
        "city_analyzer" : {

            "type" : "snowball",

            "language" : "English"

        }

    },

    "tokenizer" : {

        "myEdgeNGram" : {

            "type" : "edgeNGram",

            "min_gram" : 2,

            "max_gram" : 5

        },

        "name_tokenizer": {
"type": "edgeNGram",

"max_gram": 100,

"min_gram": 4

}
    },

    "filter": {
"multi_words": {

"type": "shingle",

"min_shingle_size": 2,

"max_shingle_size": 10

},

"name_filter": {

"type": "edgeNGram",

"max_gram": 100,

"min_gram": 4

}

}
}
}

And my mappings are :

tweet_mapping = {

"_all" : {
"enabled" : False
},
"_ttl" : {
"enabled" : True,
"default" : "400d"
},
"_parent" : {
"type" : 'user'
},
"properties": {
"textfr": {
'type': 'string',
'_analyzer': 'snowFrench',
'copy_to': 'text'
},
"texten": {
'type': 'string',
'_analyzer': 'snowEnglish',
'copy_to': 'text'
},
"textde": {
'type': 'string',
'_analyzer': 'snowGerman',
'copy_to': 'text'
},
"textja": {
'type': 'string',
'_analyzer': 'snowJapanese',
'copy_to': 'text'
},
"textru": {
'type': 'string',
'_analyzer': 'snowRussian',
'copy_to': 'text'
},
"textes": {
'type': 'string',
'_analyzer': 'snowSpanish',
'copy_to': 'text'
},
"text": {
'type': 'string',
'null_value': '',
'index': 'analyzed',
'store': 'yes'
},
"entities": {
'type': 'object',
'index': 'analyzed',
'store': 'yes',
'properties': {
"hashtags": {
'index': 'analyzed',
'store': 'yes',
'type': 'string',
"_analyzer": "edgeNGramAnalyzer"
},
"mentions": {
'index': 'not_analyzed',
'store': 'yes',
'type': 'long',
'precision_step': 64
}
}
},
"lang": {
'index': 'not_analyzed',
'store': 'yes',
'type': 'string'
},
"created_at": {
'index': 'not_analyzed',
'store': 'yes',
'type': 'date',
'format' : 'dd-MM-YYYY HH:mm:ss'
}
}
}
user_mapping = {
"_all" : {
"enabled" : False
},
"_ttl" : {
"enabled" : True,
"default" : "600d"
},
"properties": {
"lang": {
'index': 'not_analyzed',
'store': 'yes',
'type': 'string'
},
"name": {
'index': 'analyzed',
'store': 'yes',
'type': 'string',
"_analyzer": "edgeNGramAnalyzer"
},
"screen_name": {
'index': 'analyzed',
'store': 'yes',
'type': 'string',
"_analyzer": "edgeNGramAnalyzer"
},
"descfr": {
'type': 'string',
'_analyzer': 'snowFrench',
'copy_to': 'description'
},
"descen": {
'type': 'string',
'_analyzer': 'snowEnglish',
'copy_to': 'description'
},
"descde": {
'type': 'string',
'_analyzer': 'snowGerman',
'copy_to': 'description'
},
"descja": {
'type': 'string',
'_analyzer': 'snowJapanese',
'copy_to': 'description'
},
"descru": {
'type': 'string',
'_analyzer': 'snowRussian',
'copy_to': 'description'
},
"desces": {
'type': 'string',
'_analyzer': 'snowSpanish',
'copy_to': 'description'
},
"description": {
'type': 'string',
'null_value': '',
'index': 'analyzed',
'store': 'yes'
},
"created_at": {
'index': 'not_analyzed',
'store': 'yes',
'type': 'date',
'format' : 'dd-MM-YYYY HH:mm:ss'
},
"profile_image_url": {
'index': 'not_analyzed',
'store': 'yes',
'type': 'string'
},
"analysis": {
'type': 'object',
'index': 'analyzed',
'store': 'yes',
'properties': {
"hashtags": {
'index': 'analyzed',
'store': 'yes',
'type': 'object'
},
"relations": {
'index': 'analyzed',
'store': 'yes',
'type': 'object'
},
"score": {
'index': 'analyzed',
'store': 'yes',
'type': 'object'
}
}
},
"location" : {
'type': 'object',
'index': 'analyzed',
'store': 'yes',
"properties" : {
"search_field": {
'index': 'analyzed',
'store': 'yes',
'type': 'string',
'analyzer': 'city_analyzer',
'null_value': ''
},
"name": {
'index': 'not_analyzed',
'store': 'yes',
'type': 'string',
'null_value': ''
},
"city": {
'index': 'analyzed',
'store': 'yes',
'type': 'object',
'properties': {
'name': {
'boost': 3.0,
'index': 'analyzed',
'store': 'yes',
'type': 'string',
'copy_to': 'location.search_field'
},
'full_name': {
'boost': 3.0,
'index': 'analyzed',
'store': 'yes',
'type': 'string',
'copy_to': ['location.search_field', 'location.name']
},
'alternate_names': {
'boost': 2.0,
'index': 'analyzed',
'store': 'yes',
'type': 'string',
'copy_to': 'location.search_field'
}
}
},
"admin2": {
'index': 'analyzed',
'store': 'yes',
'type': 'object',
'properties': {
'name': {
'boost': 1.5,
'index': 'analyzed',
'store': 'yes',
'type': 'string',
'copy_to': 'location.search_field'
},
'full_name': {
'boost': 1.5,
'index': 'analyzed',
'store': 'yes',
'type': 'string',
'copy_to': ['location.search_field', 'location.name']
}
}
},
"admin1": {
'index': 'analyzed',
'store': 'yes',
'type': 'object',
'properties': {
'name': {
'boost': 1.2,
'index': 'analyzed',
'store': 'yes',
'type': 'string',

...

--
You received this message because you are subscribed to the Google Groups "elasticsearch" group.
To unsubscribe from this group and stop receiving emails from it, send an email to elasticsearch+unsubscribe@googlegroups.com.
To view this discussion on the web visit https://groups.google.com/d/msgid/elasticsearch/7875dd94-8b2a-405e-aaf4-eeb3c21bd53b%40googlegroups.com.
For more options, visit https://groups.google.com/d/optout.

Topic		Replies	Views
[Discussion] I have two options for designing my indexes, which one do you thinks will be better? Elasticsearch	6	414	September 22, 2017
Search in a given index/type but filter by another type Elasticsearch	3	291	July 6, 2017
Looking for my "ah ha!" moment Elasticsearch	4	480	July 6, 2017
Indexing Criteria Elasticsearch	2	334	July 5, 2017
Just Pushed: Indexer Support + Twtiter indexer plugin Elasticsearch	5	308	July 6, 2017

Enhancing perf for my cluster

Related topics