Facets / OutOfMemoryError / Required RAM


(Loco Jay) #1

Hi,

I'm running into an OutOfMemoryError on a facet query.

Running es @ 14g :

elasticsearch -f -Xmx14g -Xms14g -XX:+HeapDumpOnOutOfMemoryError

It seems that facets always load the full index of a field into memory. Distributing the work by splitting the query into smaller chunks (from, size) would not work.

I hoped that changing my field to a multifield and using another tokenizer which strips out numbers... would make this work but
unfortunately it failed.

I guess my only chance is to buy more RAM, but how much? How can I check how much memory would be required for a successful call?

Below is the output of the stats API:

{
"cluster_name" : "elasticsearch",
"nodes" : {
"LCHFG_l2Qjysf_fGKFrlgA" : {
"name" : "Puma",
"indices" : {
"store" : {
"size" : "79.6gb",
"size_in_bytes" : 85495982520
},
"docs" : {
"count" : 1528884,
"deleted" : 108
},
"indexing" : {
"index_total" : 0,
"index_time" : "0s",
"index_time_in_millis" : 0,
"delete_total" : 0,
"delete_time" : "0s",
"delete_time_in_millis" : 0
},
"get" : {
"total" : 0,
"time" : "0s",
"time_in_millis" : 0,
"exists_total" : 0,
"exists_time" : "0s",
"exists_time_in_millis" : 0,
"missing_total" : 0,
"missing_time" : "0s",
"missing_time_in_millis" : 0
},
"search" : {
"query_total" : 40,
"query_time" : "2s",
"query_time_in_millis" : 2008,
"fetch_total" : 8,
"fetch_time" : "35ms",
"fetch_time_in_millis" : 35
},
"cache" : {
"field_evictions" : 0,
"field_size" : "5.8gb",
"field_size_in_bytes" : 6323033016,
"filter_count" : 0,
"filter_evictions" : 0,
"filter_size" : "0b",
"filter_size_in_bytes" : 0
},
"merges" : {
"current" : 0,
"current_docs" : 0,
"current_size" : "0b",
"current_size_in_bytes" : 0,
"total" : 0,
"total_time" : "0s",
"total_time_in_millis" : 0,
"total_docs" : 0,
"total_size" : "0b",
"total_size_in_bytes" : 0
},
"refresh" : {
"total" : 10,
"total_time" : "0s",
"total_time_in_millis" : 0
},
"flush" : {
"total" : 0,
"total_time" : "0s",
"total_time_in_millis" : 0
}
},
"os" : {
"timestamp" : 1321461279333,
"uptime" : "23 minutes and 34 seconds",
"uptime_in_millis" : 1414000,
"load_average" : [ 1.57470703125, 1.248046875, 0.896484375 ],
"cpu" : {
"sys" : 1,
"user" : 5,
"idle" : 93
},
"mem" : {
"free" : "571.1mb",
"free_in_bytes" : 598867968,
"used" : "15.4gb",
"used_in_bytes" : 16581001216,
"free_percent" : 31,
"used_percent" : 68,
"actual_free" : "5gb",
"actual_free_in_bytes" : 5403758592,
"actual_used" : "10.9gb",
"actual_used_in_bytes" : 11776110592
},
"swap" : {
"used" : "28mb",
"used_in_bytes" : 29368320,
"free" : "227.9mb",
"free_in_bytes" : 239067136
}
},
"process" : {
"timestamp" : 1321461279333,
"open_file_descriptors" : 1826,
"cpu" : {
"percent" : 85,
"sys" : "46 seconds and 91 milliseconds",
"sys_in_millis" : 46091,
"user" : "4 minutes, 12 seconds and 994 milliseconds",
"user_in_millis" : 252994,
"total" : "4 minutes, 59 seconds and 85 milliseconds",
"total_in_millis" : 299085
},
"mem" : {
"resident" : "12.3gb",
"resident_in_bytes" : 13241835520,
"share" : "-1b",
"share_in_bytes" : -1,
"total_virtual" : "14.9gb",
"total_virtual_in_bytes" : 16070979584
}
},
"jvm" : {
"timestamp" : 1321461279333,
"uptime" : "5 minutes, 47 seconds and 605 milliseconds",
"uptime_in_millis" : 347605,
"mem" : {
"heap_used" : "6gb",
"heap_used_in_bytes" : 6534761792,
"heap_committed" : "11.9gb",
"heap_committed_in_bytes" : 12856590336,
"non_heap_used" : "31.4mb",
"non_heap_used_in_bytes" : 33028544,
"non_heap_committed" : "52.9mb",
"non_heap_committed_in_bytes" : 55484416
},
"threads" : {
"count" : 37,
"peak_count" : 42
},
"gc" : {
"collection_count" : 139,
"collection_time" : "14 seconds and 275 milliseconds",
"collection_time_in_millis" : 14275,
"collectors" : {
"ParNew" : {
"collection_count" : 97,
"collection_time" : "13 seconds and 383 milliseconds",
"collection_time_in_millis" : 13383
},
"ConcurrentMarkSweep" : {
"collection_count" : 42,
"collection_time" : "892 milliseconds",
"collection_time_in_millis" : 892
}
}
}
},
"network" : {
"tcp" : {
"active_opens" : 306,
"passive_opens" : 55,
"curr_estab" : 67,
"in_segs" : 54832,
"out_segs" : 52636,
"retrans_segs" : 0,
"estab_resets" : 4,
"attempt_fails" : 106,
"in_errs" : 1,
"out_rsts" : -1
}
},
"transport" : {
"server_open" : 7,
"rx_count" : 0,
"rx_size" : "0b",
"rx_size_in_bytes" : 0,
"tx_count" : 0,
"tx_size" : "0b",
"tx_size_in_bytes" : 0
},
"http" : {
"server_open" : 4
}
}
}
}


(Shay Banon) #2

Yes, faceting will require loading all the field values to memory (as does
sorting). Hard to tell how much memory you will need, since it depends on
how many values and how big they are for that field.

On Wed, Nov 16, 2011 at 7:06 PM, LocoJay Dev locojaydev@gmail.com wrote:

Hi,

I'm running into an OutOfMemoryError on a facet query.

Running es @ 14g :

   elasticsearch -f -Xmx14g -Xms14g -XX:+HeapDumpOnOutOfMemoryError

It seems that facets always load the full index of a field into memory.
Distributing the work by splitting the query into smaller chunks (from, size) would
not work.

I hoped that changing my field to a multifield and using another tokenizer
which strips out numbers... would make this work but
unfortunately it failed.

I guess my only chance is to buy more RAM, but how much? How can I check
how much memory would be required for a successful call?

Below is the output of the stats API:

{
"cluster_name" : "elasticsearch",
"nodes" : {
"LCHFG_l2Qjysf_fGKFrlgA" : {
"name" : "Puma",
"indices" : {
"store" : {
"size" : "79.6gb",
"size_in_bytes" : 85495982520
},
"docs" : {
"count" : 1528884,
"deleted" : 108
},
"indexing" : {
"index_total" : 0,
"index_time" : "0s",
"index_time_in_millis" : 0,
"delete_total" : 0,
"delete_time" : "0s",
"delete_time_in_millis" : 0
},
"get" : {
"total" : 0,
"time" : "0s",
"time_in_millis" : 0,
"exists_total" : 0,
"exists_time" : "0s",
"exists_time_in_millis" : 0,
"missing_total" : 0,
"missing_time" : "0s",
"missing_time_in_millis" : 0
},
"search" : {
"query_total" : 40,
"query_time" : "2s",
"query_time_in_millis" : 2008,
"fetch_total" : 8,
"fetch_time" : "35ms",
"fetch_time_in_millis" : 35
},
"cache" : {
"field_evictions" : 0,
"field_size" : "5.8gb",
"field_size_in_bytes" : 6323033016,
"filter_count" : 0,
"filter_evictions" : 0,
"filter_size" : "0b",
"filter_size_in_bytes" : 0
},
"merges" : {
"current" : 0,
"current_docs" : 0,
"current_size" : "0b",
"current_size_in_bytes" : 0,
"total" : 0,
"total_time" : "0s",
"total_time_in_millis" : 0,
"total_docs" : 0,
"total_size" : "0b",
"total_size_in_bytes" : 0
},
"refresh" : {
"total" : 10,
"total_time" : "0s",
"total_time_in_millis" : 0
},
"flush" : {
"total" : 0,
"total_time" : "0s",
"total_time_in_millis" : 0
}
},
"os" : {
"timestamp" : 1321461279333,
"uptime" : "23 minutes and 34 seconds",
"uptime_in_millis" : 1414000,
"load_average" : [ 1.57470703125, 1.248046875, 0.896484375 ],
"cpu" : {
"sys" : 1,
"user" : 5,
"idle" : 93
},
"mem" : {
"free" : "571.1mb",
"free_in_bytes" : 598867968,
"used" : "15.4gb",
"used_in_bytes" : 16581001216,
"free_percent" : 31,
"used_percent" : 68,
"actual_free" : "5gb",
"actual_free_in_bytes" : 5403758592,
"actual_used" : "10.9gb",
"actual_used_in_bytes" : 11776110592
},
"swap" : {
"used" : "28mb",
"used_in_bytes" : 29368320,
"free" : "227.9mb",
"free_in_bytes" : 239067136
}
},
"process" : {
"timestamp" : 1321461279333,
"open_file_descriptors" : 1826,
"cpu" : {
"percent" : 85,
"sys" : "46 seconds and 91 milliseconds",
"sys_in_millis" : 46091,
"user" : "4 minutes, 12 seconds and 994 milliseconds",
"user_in_millis" : 252994,
"total" : "4 minutes, 59 seconds and 85 milliseconds",
"total_in_millis" : 299085
},
"mem" : {
"resident" : "12.3gb",
"resident_in_bytes" : 13241835520,
"share" : "-1b",
"share_in_bytes" : -1,
"total_virtual" : "14.9gb",
"total_virtual_in_bytes" : 16070979584
}
},
"jvm" : {
"timestamp" : 1321461279333,
"uptime" : "5 minutes, 47 seconds and 605 milliseconds",
"uptime_in_millis" : 347605,
"mem" : {
"heap_used" : "6gb",
"heap_used_in_bytes" : 6534761792,
"heap_committed" : "11.9gb",
"heap_committed_in_bytes" : 12856590336,
"non_heap_used" : "31.4mb",
"non_heap_used_in_bytes" : 33028544,
"non_heap_committed" : "52.9mb",
"non_heap_committed_in_bytes" : 55484416
},
"threads" : {
"count" : 37,
"peak_count" : 42
},
"gc" : {
"collection_count" : 139,
"collection_time" : "14 seconds and 275 milliseconds",
"collection_time_in_millis" : 14275,
"collectors" : {
"ParNew" : {
"collection_count" : 97,
"collection_time" : "13 seconds and 383 milliseconds",
"collection_time_in_millis" : 13383
},
"ConcurrentMarkSweep" : {
"collection_count" : 42,
"collection_time" : "892 milliseconds",
"collection_time_in_millis" : 892
}
}
}
},
"network" : {
"tcp" : {
"active_opens" : 306,
"passive_opens" : 55,
"curr_estab" : 67,
"in_segs" : 54832,
"out_segs" : 52636,
"retrans_segs" : 0,
"estab_resets" : 4,
"attempt_fails" : 106,
"in_errs" : 1,
"out_rsts" : -1
}
},
"transport" : {
"server_open" : 7,
"rx_count" : 0,
"rx_size" : "0b",
"rx_size_in_bytes" : 0,
"tx_count" : 0,
"tx_size" : "0b",
"tx_size_in_bytes" : 0
},
"http" : {
"server_open" : 4
}
}
}
}


(Karussell) #3

On Nov 17, 1:57 pm, Shay Banon kim...@gmail.com wrote:

Yes, faceting will require loading all the field values to memory (as does
sorting). Hard to tell how much memory you will need, since it depends on
how many values and how big they are for that field.

But couldn't it be reduced when using a filter for (query+)facet?

I hoped that changing my field to a multifield and using another tokenizer which strips out numbers... would make this work but unfortunately it failed.

what failed? did you reindex?

Regards,
Peter.


(Loco Jay) #4

Thanks.

I upgraded my node to run es @ 30g but unfortunately this did not help.
Just to make sure: if I define the following for a field:

    "fulltext": {
        "type": "multi_field",
        "fields":{
            "fulltext":{
                "index": "analyzed",
                "store": "yes",
                "term_vector": "with_positions_offsets",
                "type": "string"
            },
            "cleaned":{
                "index": "analyzed",
                "analyzer":"cleaned",
                "store": "yes",
                "term_vector": "with_positions_offsets",
                "type": "string"

            }
        }
    },

will only fulltext.cleaned be loaded into memory, or both fulltext and fulltext.cleaned?
Are there any plans to go to disk, or to give the ability to do facets on a subsearch? I really just need a term:count, and Luke is not an option.

thanks for helping out.

On Nov 17, 2011, at 7:57 AM, Shay Banon wrote:

Yes, faceting will require loading all the field values to memory (as does sorting). Hard to tell how much memory you will need, since it depends on how many values and how big they are for that field.

On Wed, Nov 16, 2011 at 7:06 PM, LocoJay Dev locojaydev@gmail.com wrote:

Hi,

I'm running into an OutOfMemoryError on a facet query.

Running es @ 14g :

   elasticsearch -f -Xmx14g -Xms14g -XX:+HeapDumpOnOutOfMemoryError

It seems that facets always load the full index of a field into memory. Distributing the work by splitting the query into smaller chunks (from, size) would not work.

I hoped that changing my field to a multifield and using another tokenizer which strips out numbers... would make this work but
unfortunately it failed.

I guess my only chance is to buy more RAM, but how much? How can I check how much memory would be required for a successful call?

Below is the output of the stats API:

{
"cluster_name" : "elasticsearch",
"nodes" : {
"LCHFG_l2Qjysf_fGKFrlgA" : {
"name" : "Puma",
"indices" : {
"store" : {
"size" : "79.6gb",
"size_in_bytes" : 85495982520
},
"docs" : {
"count" : 1528884,
"deleted" : 108
},
"indexing" : {
"index_total" : 0,
"index_time" : "0s",
"index_time_in_millis" : 0,
"delete_total" : 0,
"delete_time" : "0s",
"delete_time_in_millis" : 0
},
"get" : {
"total" : 0,
"time" : "0s",
"time_in_millis" : 0,
"exists_total" : 0,
"exists_time" : "0s",
"exists_time_in_millis" : 0,
"missing_total" : 0,
"missing_time" : "0s",
"missing_time_in_millis" : 0
},
"search" : {
"query_total" : 40,
"query_time" : "2s",
"query_time_in_millis" : 2008,
"fetch_total" : 8,
"fetch_time" : "35ms",
"fetch_time_in_millis" : 35
},
"cache" : {
"field_evictions" : 0,
"field_size" : "5.8gb",
"field_size_in_bytes" : 6323033016,
"filter_count" : 0,
"filter_evictions" : 0,
"filter_size" : "0b",
"filter_size_in_bytes" : 0
},
"merges" : {
"current" : 0,
"current_docs" : 0,
"current_size" : "0b",
"current_size_in_bytes" : 0,
"total" : 0,
"total_time" : "0s",
"total_time_in_millis" : 0,
"total_docs" : 0,
"total_size" : "0b",
"total_size_in_bytes" : 0
},
"refresh" : {
"total" : 10,
"total_time" : "0s",
"total_time_in_millis" : 0
},
"flush" : {
"total" : 0,
"total_time" : "0s",
"total_time_in_millis" : 0
}
},
"os" : {
"timestamp" : 1321461279333,
"uptime" : "23 minutes and 34 seconds",
"uptime_in_millis" : 1414000,
"load_average" : [ 1.57470703125, 1.248046875, 0.896484375 ],
"cpu" : {
"sys" : 1,
"user" : 5,
"idle" : 93
},
"mem" : {
"free" : "571.1mb",
"free_in_bytes" : 598867968,
"used" : "15.4gb",
"used_in_bytes" : 16581001216,
"free_percent" : 31,
"used_percent" : 68,
"actual_free" : "5gb",
"actual_free_in_bytes" : 5403758592,
"actual_used" : "10.9gb",
"actual_used_in_bytes" : 11776110592
},
"swap" : {
"used" : "28mb",
"used_in_bytes" : 29368320,
"free" : "227.9mb",
"free_in_bytes" : 239067136
}
},
"process" : {
"timestamp" : 1321461279333,
"open_file_descriptors" : 1826,
"cpu" : {
"percent" : 85,
"sys" : "46 seconds and 91 milliseconds",
"sys_in_millis" : 46091,
"user" : "4 minutes, 12 seconds and 994 milliseconds",
"user_in_millis" : 252994,
"total" : "4 minutes, 59 seconds and 85 milliseconds",
"total_in_millis" : 299085
},
"mem" : {
"resident" : "12.3gb",
"resident_in_bytes" : 13241835520,
"share" : "-1b",
"share_in_bytes" : -1,
"total_virtual" : "14.9gb",
"total_virtual_in_bytes" : 16070979584
}
},
"jvm" : {
"timestamp" : 1321461279333,
"uptime" : "5 minutes, 47 seconds and 605 milliseconds",
"uptime_in_millis" : 347605,
"mem" : {
"heap_used" : "6gb",
"heap_used_in_bytes" : 6534761792,
"heap_committed" : "11.9gb",
"heap_committed_in_bytes" : 12856590336,
"non_heap_used" : "31.4mb",
"non_heap_used_in_bytes" : 33028544,
"non_heap_committed" : "52.9mb",
"non_heap_committed_in_bytes" : 55484416
},
"threads" : {
"count" : 37,
"peak_count" : 42
},
"gc" : {
"collection_count" : 139,
"collection_time" : "14 seconds and 275 milliseconds",
"collection_time_in_millis" : 14275,
"collectors" : {
"ParNew" : {
"collection_count" : 97,
"collection_time" : "13 seconds and 383 milliseconds",
"collection_time_in_millis" : 13383
},
"ConcurrentMarkSweep" : {
"collection_count" : 42,
"collection_time" : "892 milliseconds",
"collection_time_in_millis" : 892
}
}
}
},
"network" : {
"tcp" : {
"active_opens" : 306,
"passive_opens" : 55,
"curr_estab" : 67,
"in_segs" : 54832,
"out_segs" : 52636,
"retrans_segs" : 0,
"estab_resets" : 4,
"attempt_fails" : 106,
"in_errs" : 1,
"out_rsts" : -1
}
},
"transport" : {
"server_open" : 7,
"rx_count" : 0,
"rx_size" : "0b",
"rx_size_in_bytes" : 0,
"tx_count" : 0,
"tx_size" : "0b",
"tx_size_in_bytes" : 0
},
"http" : {
"server_open" : 4
}
}
}
}


(Loco Jay) #5

On Nov 17, 2011, at 3:49 PM, Karussell wrote:

On Nov 17, 1:57 pm, Shay Banon kim...@gmail.com wrote:

Yes, faceting will require loading all the field values to memory (as does
sorting). Hard to tell how much memory you will need, since it depends on
how many values and how big they are for that field.

But couldn't it be reduced when using a filter for (query+)facet?

This is what I'm looking for. It seems that this is not possible.

I hoped that changing my field to a multifield and using another tokenizer which strips out numbers... would make this work but unfortunately it failed.

what failed? did you reindex?

I reindexed using a different tokenizer that produces a smaller number of words, but this still failed with an OutOfMemoryError after upgrading my node to run es @ 30g.

   "fulltext": {
        "type": "multi_field",
        "fields":{
            "fulltext":{
                "index": "analyzed",
                "store": "yes",
                "term_vector": "with_positions_offsets",
                "type": "string"
            },
            "cleaned":{
                "index": "analyzed",
                "analyzer":"cleaned",
                "store": "yes",
                "term_vector": "with_positions_offsets",
                "type": "string"

            }
        }
    },

where the "cleaned" analyzer is defined as:

    "settings":{
        "analysis":
        {
            "analyzer":
            {
                "cleaned":
                {
                    "type":"custom",
                    "filter":["lowercase", "stop"],
                    "tokenizer":"letter"
                }
            }
        }
    }

Regards,
Peter.


(system) #6