Hi everyone!
I'm currently working on a tool built with ES and the Twitter Streaming API,
which tries to find interesting profiles on Twitter based on what they
tweet and retweet, and on which of their interactions get shared/retweeted.
Anyway, I use ES to index and search tweets. To do that, I take the
Twitter stream data and put users & tweets (2 types) in a single index,
linked by the user id via a parent-child relation. I have thought about
my indexing scheme a lot, and I believe this is the best way to do it:
- I need to update users very often (because I score them and because they
update their profiles quite often), so nesting the user inside each tweet is
not an option (too many duplicated copies of the user to keep in sync).
- I could put a user's tweets directly in the user object, but I would end up
with huge objects and I don't really want that.
I work on a SoYouStart server: 4c/4t 3.2 GHz, 32 GB RAM, 4 TB HDD.
My settings for the index are:
settings = {
"index" : {
"number_of_replicas" : 0, "refresh_interval" : '10s', "routing.allocation.disable_allocation": False },
"analysis": {
"analyzer": { "snowFrench":{ "type": "snowball", "language": "French" }, "snowEnglish":{ "type": "snowball", "language": "English" }, "snowGerman":{ "type": "snowball", "language": "German" }, "snowRussian":{ "type": "snowball", "language": "Russian" }, "snowSpanish":{ "type": "snowball", "language": "Spanish" }, "snowJapanese":{ "type": "snowball", "language": "Japanese" }, "edgeNGramAnalyzer":{ "tokenizer": "myEdgeNGram" }, "name_analyzer": {
"tokenizer": "whitespace",
"type": "custom",
"filter": ["lowercase", "multi_words", "name_filter"]
},
"city_analyzer" : { "type" : "snowball", "language" : "English" } }, "tokenizer" : { "myEdgeNGram" : { "type" : "edgeNGram", "min_gram" : 2, "max_gram" : 5 }, "name_tokenizer": {
"type": "edgeNGram",
"max_gram": 100,
"min_gram": 4
}
}, "filter": {
"multi_words": {
"type": "shingle",
"min_shingle_size": 2,
"max_shingle_size": 10
},
"name_filter": {
"type": "edgeNGram",
"max_gram": 100,
"min_gram": 4
}
}
}
}
And my mappings are:
tweet_mapping = {
"_all" : {
"enabled" : False
},
"_ttl" : {
"enabled" : True,
"default" : "400d"
},
"_parent" : {
"type" : 'user'
},
"properties": {
"textfr": {
'type': 'string',
'_analyzer': 'snowFrench',
'copy_to': 'text'
},
"texten": {
'type': 'string',
'_analyzer': 'snowEnglish',
'copy_to': 'text'
},
"textde": {
'type': 'string',
'_analyzer': 'snowGerman',
'copy_to': 'text'
},
"textja": {
'type': 'string',
'_analyzer': 'snowJapanese',
'copy_to': 'text'
},
"textru": {
'type': 'string',
'_analyzer': 'snowRussian',
'copy_to': 'text'
},
"textes": {
'type': 'string',
'_analyzer': 'snowSpanish',
'copy_to': 'text'
},
"text": {
'type': 'string',
'null_value': '',
'index': 'analyzed',
'store': 'yes'
},
"entities": {
'type': 'object',
'index': 'analyzed',
'store': 'yes',
'properties': {
"hashtags": {
'index': 'analyzed',
'store': 'yes',
'type': 'string',
"_analyzer": "edgeNGramAnalyzer"
},
"mentions": {
'index': 'not_analyzed',
'store': 'yes',
'type': 'long',
'precision_step': 64
}
}
},
"lang": {
'index': 'not_analyzed',
'store': 'yes',
'type': 'string'
},
"created_at": {
'index': 'not_analyzed',
'store': 'yes',
'type': 'date',
'format' : 'dd-MM-YYYY HH:mm:ss'
}
}
}
user_mapping = {
"_all" : {
"enabled" : False
},
"_ttl" : {
"enabled" : True,
"default" : "600d"
},
"properties": {
"lang": {
'index': 'not_analyzed',
'store': 'yes',
'type': 'string'
},
"name": {
'index': 'analyzed',
'store': 'yes',
'type': 'string',
"_analyzer": "edgeNGramAnalyzer"
},
"screen_name": {
'index': 'analyzed',
'store': 'yes',
'type': 'string',
"_analyzer": "edgeNGramAnalyzer"
},
"descfr": {
'type': 'string',
'_analyzer': 'snowFrench',
'copy_to': 'description'
},
"descen": {
'type': 'string',
'_analyzer': 'snowEnglish',
'copy_to': 'description'
},
"descde": {
'type': 'string',
'_analyzer': 'snowGerman',
'copy_to': 'description'
},
"descja": {
'type': 'string',
'_analyzer': 'snowJapanese',
'copy_to': 'description'
},
"descru": {
'type': 'string',
'_analyzer': 'snowRussian',
'copy_to': 'description'
},
"desces": {
'type': 'string',
'_analyzer': 'snowSpanish',
'copy_to': 'description'
},
"description": {
'type': 'string',
'null_value': '',
'index': 'analyzed',
'store': 'yes'
},
"created_at": {
'index': 'not_analyzed',
'store': 'yes',
'type': 'date',
'format' : 'dd-MM-YYYY HH:mm:ss'
},
"profile_image_url": {
'index': 'not_analyzed',
'store': 'yes',
'type': 'string'
},
"analysis": {
'type': 'object',
'index': 'analyzed',
'store': 'yes',
'properties': {
"hashtags": {
'index': 'analyzed',
'store': 'yes',
'type': 'object'
},
"relations": {
'index': 'analyzed',
'store': 'yes',
'type': 'object'
},
"score": {
'index': 'analyzed',
'store': 'yes',
'type': 'object'
}
}
},
"location" : {
'type': 'object',
'index': 'analyzed',
'store': 'yes',
"properties" : {
"search_field": {
'index': 'analyzed',
'store': 'yes',
'type': 'string',
'analyzer': 'city_analyzer',
'null_value': ''
},
"name": {
'index': 'not_analyzed',
'store': 'yes',
'type': 'string',
'null_value': ''
},
"city": {
'index': 'analyzed',
'store': 'yes',
'type': 'object',
'properties': {
'name': {
'boost': 3.0,
'index': 'analyzed',
'store': 'yes',
'type': 'string',
'copy_to': 'location.search_field'
},
'full_name': {
'boost': 3.0,
'index': 'analyzed',
'store': 'yes',
'type': 'string',
'copy_to': ['location.search_field', 'location.name']
},
'alternate_names': {
'boost': 2.0,
'index': 'analyzed',
'store': 'yes',
'type': 'string',
'copy_to': 'location.search_field'
}
}
},
"admin2": {
'index': 'analyzed',
'store': 'yes',
'type': 'object',
'properties': {
'name': {
'boost': 1.5,
'index': 'analyzed',
'store': 'yes',
'type': 'string',
'copy_to': 'location.search_field'
},
'full_name': {
'boost': 1.5,
'index': 'analyzed',
'store': 'yes',
'type': 'string',
'copy_to': ['location.search_field', 'location.name']
}
}
},
"admin1": {
'index': 'analyzed',
'store': 'yes',
'type': 'object',
'properties': {
'name': {
'boost': 1.2,
'index': 'analyzed',
'store': 'yes',
'type': 'string',
'copy_to': 'location.search_field'
},
'full_name': {
'boost': 1.2,
'index': 'analyzed',
'store': 'yes',
'type': 'string',
'copy_to': ['location.search_field', 'location.name']
}
}
},
"country": {
'index': 'analyzed',
'store': 'yes',
'type': 'object',
'properties': {
'name': {
'index': 'analyzed',
'store': 'yes',
'type': 'string',
'copy_to': ['location.search_field', 'location.name']
},
'fips': {
'index': 'analyzed',
'store': 'yes',
'type': 'string',
'copy_to': 'location.search_field'
},
'capital': {
'index': 'analyzed',
'store': 'yes',
'type': 'string'
}
}
},
"location": {
'index': 'analyzed',
'store': 'yes',
'type': 'geo_point'
},
"population": {
'index': 'analyzed',
'store': 'yes',
'type': 'long'
},
'capital': {
'index': 'not_analyzed',
'store': 'yes',
'type': 'boolean'
}
}
}
}
}
Currently my cluster contains 60M docs (40M tweets, 20M users). I have only
one node and no replicas, because if I created another node, no data would
be allocated to it...
When I index users, I geolocate them using the location string they put in
their profile (geolocation is present in only about 5% of tweets, so it is
not very useful). To do this, I indexed (in another index) the biggest cities
in the world, and I assign a city to each user.
One other thing you should know: I use Python and the PyES library.
So, let's talk about the problem:
My goal is to sort users by the relevance of their tweets. To do that, I
analyze each user's profile, timeline and the tweets in which they are
mentioned (RTs, messages).
So what happens in my script?
I have a REST API based on Django REST Framework and a frontend written with
AngularJS.
1/ I type a keyword (e.g. java, python, nodejs) and a location (optional,
e.g. paris).
2/ I use the count API to find how many users talk about "java" in "paris".
3/ Then I get the first 20 results of this query.
4/ I do a multi_search query to get the users' timelines and mentions.
5/ I score them.
6/ Once they are scored, AngularJS displays the sorted results and sends
another request to the API to score page 2, and so on until there are no more
pages available.
The queries I run are:
1/ To get users:
{
'query': { 'bool': { 'should': [ { 'multi_match': { 'use_dis_max': True, 'query': 'java', 'type': 'boolean', 'operator': 'or', 'fields': [ 'name', 'screen_name', 'description' ] } }, { 'has_child': { 'query': { 'match': { 'text': { 'operator': 'or', 'query': 'java', 'type': 'boolean' } } }, 'type': 'tweet' } } ], 'minimum_number_should_match': 1, 'must': [ { 'function_score': { 'query': { 'match': { 'location.search_field': { 'operator': 'or', 'query': 'paris', 'type': 'boolean' } } }, 'functions': [ { 'script_score': { 'script': "_score *
(doc['capital'].value == 'T' ? 2 : 1)"
} }, { 'script_score': { 'script': "_score *
doc['search_field'].values.size()"
} } ] } } ] } }, 'from': 20, 'size': 20
}
2/ To get timelines and mentions:
{
'query': { 'match': { 'entities.mentions': { 'operator': 'or', 'query': 'userID', 'type': 'boolean' } } }, '_source': True
}
and
{
'query': { 'has_parent': { 'query': { 'match': { 'id': { 'operator': 'or', 'query': 'userID', 'type': 'boolean' } } }, 'type': 'user' } }, '_source': True
}
BUT: scoring one page can take from a few seconds to several minutes! I
don't think that's normal, right? I profiled my script and the culprit is
clear: the ES requests take far too long. Usually it's something like
10-20 sec (which is still too long), but sometimes it can take up to 90 sec...
I have studied ES quite thoroughly and I think I understand many things, but
here I don't know what I can do to change that. Any ideas? Thanks!
--
You received this message because you are subscribed to the Google Groups "elasticsearch" group.
To unsubscribe from this group and stop receiving emails from it, send an email to elasticsearch+unsubscribe@googlegroups.com.
To view this discussion on the web visit https://groups.google.com/d/msgid/elasticsearch/5342d476-eed6-40f9-9fa9-93dde23371b2%40googlegroups.com.
For more options, visit https://groups.google.com/d/optout.