Squid access.log > filebeat > logstash > elasticsearch

Hello,

I am a rookie, please bear with me. I need to parse squid3's access.log into Elasticsearch; I have searched the web to no avail.

I am looking for a working example (all on the latest versions, ES 2.3, etc.) of:
filebeat's configuration installed on the squid3 server, which forwards to the logstash server (my rough guess at this is sketched after this list)
logstash configurations (input, grok filter and output), which forward to the elasticsearch server
an elasticsearch template definition to take logstash's filtered data for squid3's access.log
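
For the filebeat part, this is my rough guess at what should go on the squid3 server (only a sketch; LOGSTASH_HOST is a placeholder), and I'd like it confirmed:

filebeat:
  prospectors:
    -
      paths:
        - /var/log/squid3/access.log
      input_type: log
      # document_type becomes the "type" field, which the logstash filter can match on
      document_type: squid

output:
  logstash:
    hosts: ["LOGSTASH_HOST:5044"]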

Are there working examples? I will need to do the same for apache2, syslog, etc. on a larger scale.

Regards,
Ashvin

There are many examples out on the web.

provides a guide for CentOS; not sure what Linux flavor you are on. Most of the guides are for installing ELK on a single host, but you can adapt them to multi-host, separate-function nodes as needed. There are also guides for putting ES behind SSL proxies. Google is your friend. It's better to try configuring an ELK stack and then post a specific question if you get stuck.

Hello Michael, thanks for answering. I had followed that same guide and it is working for syslog.

I'm on Debian 8.2. Below are my configuration files for squid filtering. Grateful if you could point out the error.

root@TEST-ELASTICSEARCH-LOGSTASH1:/etc/logstash/conf.d# cat 02-beats-input.conf
input {
  beats {
    port => 5044
  }
}



root@TEST-ELASTICSEARCH-LOGSTASH1:/etc/logstash/conf.d# cat 10-syslog-filter.conf
filter {
        if [type] == "squid" {
                grok {
                        match => [ "message", "%{NUMBER:timestamp}\s+%{NUMBER:response_time} %{IP:src_ip} %{WORD:squid_request_status}/%{NUMBER:http_status_code} %{NUMBER:reply_size_include_header} %{WORD:http_method} %{WORD:http_protocol}://%{HOSTNAME:dst_host}%{NOTSPACE:request_url} %{NOTSPACE:user} %{WORD:squid}/(?:-|%{IP:dst_ip}) %{NOTSPACE:content_type}" ]
                        add_tag => ["squid"]
                }
#               geoip {
#                       source => "dst_ip"
#               }
        }
}

root@TEST-ELASTICSEARCH-LOGSTASH1:/etc/logstash/conf.d# cat 30-elasticsearch-output.conf
output {
  elasticsearch {
    hosts => ["100.64.111.83:9200"]
    sniffing => true
    manage_template => false
    index => "%{[@metadata][beat]}-%{+YYYY-MM}"
    document_type => "%{[@metadata][type]}"
  }
    if [host] == "PROXY-752010" {
            elasticsearch {
                    hosts => ["http://100.64.111.83:9200"]
                    index => "proxy-752010-%{+YYYY-MM}"
                    document_type => "%{[@metadata][type]}"
            }
    }
}

Here is my Elasticsearch template for squid. I have no idea how to map the timestamp; in squid's access.log it's something like 12315466.123:

   "template_squid": {
      "order": 0,
      "template": "proxy*",
      "settings": {
         "index": {
            "number_of_shards": "6",
            "number_of_replicas": "2"
         }
      },
      "mappings": {
         "access": {
            "properties": {
               "http_protocol": {
                  "index": "not_analyzed",
                  "type": "string"
               },
               "squid_request_status": {
                  "index": "not_analyzed",
                  "type": "string"
               },
               "http_status_code": {
                  "index": "not_analyzed",
                  "type": "long"
               },
               "request_url": {
                  "index": "not_analyzed",
                  "type": "string"
               },
               "src_ip": {
                  "index": "not_analyzed",
                  "type": "ip"
               },
               "reply_size_include_header": {
                  "index": "not_analyzed",
                  "type": "long"
               },
               "http_method": {
                  "index": "not_analyzed",
                  "type": "string"
               },
               "content_type": {
                  "index": "not_analyzed",
                  "type": "string"
               },
               "dest_ip": {
                  "index": "not_analyzed",
                  "type": "ip"
               },
               "response_time": {
                  "index": "not_analyzed",
                  "type": "long"
               },
               "squid": {
                  "index": "not_analyzed",
                  "type": "string"
               },
               "dst_host": {
                  "index": "not_analyzed",
                  "type": "string"
               },
               "user": {
                  "index": "not_analyzed",
                  "type": "string"
               },
               "timestamp": {
                  "index": "not_analyzed",
                  "type": "long"
               }
            }
         }
      },
      "aliases": {}
   }
}

I have no idea what's going on. The index is created and I can see the number of docs increasing, but only at about 2 docs per second when it should be much more, and in the logstash log I'm getting this, repeated over and over:

{:timestamp=>"2016-04-07T22:14:22.747000+0000", :message=>"Failed action. ", :status=>400, :action=>["index", {:_id=>nil, :_index=>"proxy-752010-2016-04", :_type=>"squid", :_routing=>nil}, #<LogStash::Event:0x52056430 @metadata_accessors=#<LogStash::Util::Accessors:0x4d16b0bf @store={"beat"=>"filebeat", "type"=>"squid"}, @lut={"[type]"=>[{"beat"=>"filebeat", "type"=>"squid"}, "type"]}>, @cancelled=false, @data={"message"=>"1458749791.243      5 10.35.192.62 TAG_NONE/503 4005 GET https://ww1082.smartadserver.com/imp? - HIER_NONE/- text/html", "@version"=>"1", "@timestamp"=>"2016-04-07T22:14:13.224Z", "beat"=>{"hostname"=>"PROXY-752010", "name"=>"PROXY-752010"}, "count"=>1, "fields"=>nil, "input_type"=>"log", "offset"=>5950646642, "source"=>"/var/log/squid3/access.log", "type"=>"squid", "host"=>"PROXY-752010", "tags"=>["beats_input_codec_plain_applied", "squid"], "timestamp"=>"1458749791.243", "response_time"=>"5", "src_ip"=>"10.35.192.62", "squid_request_status"=>"TAG_NONE", "http_status_code"=>"503", "reply_size_include_header"=>"4005", "http_method"=>"GET", "http_protocol"=>"https", "dst_host"=>"ww1082.smartadserver.com", "request_url"=>"/imp?", "user"=>"-", "squid"=>"HIER_NONE", "content_type"=>"text/html"}, @metadata={"beat"=>"filebeat", "type"=>"squid"}, @accessors=#<LogStash::Util::Accessors:0x2105e7d5 @store={"message"=>"1458749791.243      5 10.35.192.62 TAG_NONE/503 4005 GET https://ww1082.smartadserver.com/imp? - HIER_NONE/- text/html", "@version"=>"1", "@timestamp"=>"2016-04-07T22:14:13.224Z", "beat"=>{"hostname"=>"PROXY-752010", "name"=>"PROXY-752010"}, "count"=>1, "fields"=>nil, "input_type"=>"log", "offset"=>5950646642, "source"=>"/var/log/squid3/access.log", "type"=>"squid", "host"=>"PROXY-752010", "tags"=>["beats_input_codec_plain_applied", "squid"], 

Grateful if you could help :slight_smile:

I have also noticed that logstash.log has eaten all my HD space (300 GB).

You probably want to leave the syslog filter to handle syslog and create another filter for squid. You want to use grok to match the content of your squid log; use https://grokdebug.herokuapp.com/ to help with the pattern. You can add your own pattern to the stock patterns or point grok at a directory with your grok_patterns file (see the patterns_dir sketch after the sample). A sample file can be:

12-squid.conf:

filter {
  if [type] == "squid" {
    grok {
      match => [ "message", "%{INT:timestamp}.%{INT}\s*%{NUMBER:request_msec:float} %{IPORHOST:src_ip} %{WORD:cache_result}/%{NUMBER:response_status:int} %{NUMBER:response_size:int} %{WORD:http_method} (%{URIPROTO:http_proto}://)?%{IPORHOST:dst_host}(?::%{POSINT:port})?(?:%{DATA:uri_param})? %{USERNAME:cache_user} %{WORD:request_route}/(%{IPORHOST:forwarded_to}|-) %{GREEDYDATA:content_type}" ]
    }
    # drop events the grok pattern could not parse
    if "_grokparsefailure" in [tags] {
      drop { }
    }
    date {
      match => [ "timestamp", "UNIX" ]
    }
  }
}
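
If you go the custom-pattern route, grok can load pattern files from a directory via patterns_dir. Just a sketch (the path and the SQUID_TIMESTAMP name are made up for illustration):

# /etc/logstash/patterns/squid  -- one pattern per line: NAME regex
SQUID_TIMESTAMP %{INT}\.%{INT}

# then point grok at that directory in the filter:
grok {
  patterns_dir => ["/etc/logstash/patterns"]
  match => [ "message", "%{SQUID_TIMESTAMP:timestamp}%{SPACE}%{GREEDYDATA:rest}" ]
}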

What are your settings for logstash logging? What does logstash.log contain? Are you seeing any indexes in your ES? Note that's just an example filter; you may need to modify it for your needs and to fit your own log content.

Hello,

Thanks for your patience :slight_smile:

It's getting better. From logstash.log I'm getting:

"error"=>{"type"=>"mapper_parsing_exception", "reason"=>"failed to parse [timestamp]", "caused_by"=>{"type"=>"illegal_argument_exception", "reason"=>"Invalid format: \"1458808414.933\" is malformed at \"808414.933\""}}}}, :level=>:warn}

Now this:

"error"=>{"type"=>"mapper_parsing_exception", "reason"=>"failed to parse [src_ip]", "caused_by"=>{"type"=>"number_format_exception", "reason"=>"For input string: \"10.35.141.42\""}}}}, :level=>:warn}

OK, I seemingly got it to work: filebeat is pushing squid's access.log from one server to a dedicated logstash server, which does the filtering and sends to another elasticsearch server. In logstash's log I'm getting this:

{:timestamp=>"2016-04-11T13:51:47.251000+0000", :message=>"Beats input: The circuit breaker has detected a slowdown or stall in the pipeline, the input is closing the current connection and rejecting new connection until the pipeline recover.", :exception=>LogStash::Inputs::BeatsSupport::CircuitBreaker::HalfOpenBreaker, :level=>:warn} 
{:timestamp=>"2016-04-11T13:50:08.371000+0000", :message=>"Beats input: the pipeline is blocked, temporary refusing new connection.", :reconnect_backoff_sleep=>0.5, :level=>:warn}
{:timestamp=>"2016-04-11T13:50:08.871000+0000", :message=>"Beats input: the pipeline is blocked, temporary refusing new connection.", :reconnect_backoff_sleep=>0.5, :level=>:warn}
...

Filebeat log gives:

2016-04-11T15:54:22+02:00 DBG  Try to publish 746 events to logstash with window size 17
2016-04-11T15:54:22+02:00 DBG  close connection
2016-04-11T15:54:22+02:00 DBG  0 events out of 746 events sent to logstash. Continue sending ...
2016-04-11T15:54:22+02:00 INFO Error publishing events (retrying): lumberjack protocol error
2016-04-11T15:54:22+02:00 INFO send fail
2016-04-11T15:54:22+02:00 INFO backoff retry: 1m0s

I have increased LS_HEAP_SIZE to "1536m", but the problem persists. Any ideas?
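
For reference, I set that in /etc/default/logstash (assuming that's where Debian's init script picks it up):

LS_HEAP_SIZE="1536m"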

See if the logs are making it to the elasticsearch server; is there an index with your stuff in it? Take a look in Kibana. Based on your logs it seems the logstash queue is full and can't accept any additional input from beats. How much are you sending to logstash? If it's able to send them to ES, logstash should be able to keep up with a single squid client easily.
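
A quick way to check from the command line (using the ES host from your output config) is the cat indices API:

curl 'http://100.64.111.83:9200/_cat/indices?v'

That lists each index with its doc count, so you can tell whether the proxy index is actually growing.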

Yes, they are. I have increased my server's memory to 8 GB and the heap size to 4 GB. I am sending from one squid server. The same error message appears in the log.

Adding congestion_threshold => 10 to the beats input gives a better result. No errors or warnings in the logs, for the time being :stuck_out_tongue:
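
For reference, that's just the beats input from 02-beats-input.conf with the extra option, something like:

input {
  beats {
    port => 5044
    # seconds to wait on a blocked pipeline before the circuit breaker trips
    congestion_threshold => 10
  }
}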

My filter/mapping is not parsing. In Kibana:

message:1459522385.314 167 10.35.32.27 TCP_MISS/200 5183 POST http://www.jazzradio.fr/winradio/prog10.xml default ORIGINAL_DST/84.16.76.183 text/xml @version:1 @timestamp:April 12th 2016, 12:17:02.146 beat.hostname:PROXY-752010 beat.name:PROXY-752010 count:1 fields: - input_type:log offset:7,040,470,011 source:/var/log/squid3/access.log type:squid host:PROXY-752010 tags:beats_input_codec_plain_applied, _grokparsefailure _id:AVQJjC_j1HD874OS1YQA _type:squid _index:proxy-752010-2016-04 _score:

Need to start from scratch :frowning:

Let's start simple. This is my filter:

%{INT:timestamp}.%{INT:timestamp_ms}\s+%{INT:response_time} %{IPV4:src_ip} %{WORD:squid_request_status}/%{INT:http_status_code} %{INT:reply_size_include_header} %{WORD:http_method} %{NOTSPACE:request_url} %{NOTSPACE:user} %{WORD:squid}/%{IPV4:server_ip} %{NOTSPACE:content_type}

This is my elasticsearch template:

PUT /_template/template_squid
{
  "template": "proxy*",
  "settings": {
    "number_of_shards": "6",
    "number_of_replicas": 2
  },
  "mappings": {
    "squid": {
      "properties": {
        "timestamp": {
          "type": "long",
          "index": "not_analyzed"
        },
        "timestamp_ms": {
            "type": "long",
            "index": "not_analyzed"
        },
        "response_time": {
          "type": "long",
          "index": "not_analyzed"
        },
        "src_ip": {
          "type": "ip",
          "index": "not_analyzed"
        },
        "squid_request_status": {
          "type": "string",
          "index": "not_analyzed"
        },
        "http_status_code": {
          "type": "long",
          "index": "not_analyzed"
        },
        "reply_size_include_header": {
          "type": "long",
          "index": "not_analyzed"
        },
        "http_method": {
          "type": "string",
          "index": "not_analyzed"
        },
        "request_url": {
          "type": "string",
          "index": "not_analyzed"
        },
        "user": {
          "type": "string",
          "index": "not_analyzed"
        },
        "squid": {
          "type": "string",
          "index": "not_analyzed"
        },
        "dest_ip": {
          "type": "ip",
          "index": "not_analyzed"
        },
        "content_type": {
          "type": "string",
          "index": "not_analyzed"
        }
      }
    }
  }
}
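
For reference, the same template can also be loaded with plain curl, assuming the JSON body (without the PUT line) is saved as template_squid.json:

curl -XPUT 'http://100.64.111.83:9200/_template/template_squid' -d @template_squid.json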

Kibana gives:

{
  "_index": "proxy-752010-2016-04",
  "_type": "squid",
  "_id": "AVQJ9iS41HD874OS5Omq",
  "_score": null,
  "_source": {
    "message": "1459764056.048    351 10.35.25.25 TCP_MISS/200 25603 GET https://fonts.gstatic.com/s/raleway/v10/CcKI4k9un7TZVWzRVT-T8xsxEYwM7FgeyaSgU71cLG0.woff default ORIGINAL_DST/216.58.208.227 font/woff",
    "@version": "1",
    "@timestamp": "2016-04-12T10:12:51.321Z",
    "beat": {
      "hostname": "PROXY-752010",
      "name": "PROXY-752010"
    },
    "count": 1,
    "fields": null,
    "input_type": "log",
    "offset": 7159806464,
    "source": "/var/log/squid3/access.log",
    "type": "squid",
    "host": "PROXY-752010",
    "tags": [
      "beats_input_codec_plain_applied",
      "_grokparsefailure"
    ]
  },
  "fields": {
    "@timestamp": [
      1460455971321
    ]
  },
  "sort": [
    1460455971321
  ]
}

Can you point out my mistake please?

The _grokparsefailure tag means that the grok pattern you are using isn't matching properly, so you need to go back and tweak the grok filter. Start simple:

%{INT:timestamp} %{GREEDYDATA:message}

Then add more grok patterns and fields, testing with logstash output to stdout and not to ES. Slowly add each piece until you find the one that isn't working and fix it.
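
For the stdout test, you can temporarily swap the elasticsearch output for something like this and watch the parsed fields scroll by as filebeat ships lines:

output {
  stdout {
    codec => rubydebug
  }
}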

The thing is that I have checked the filter with the grok debugger against multiple entries and it's good. The stdout output is not complaining at all:

{
"@version" => "1",
"@timestamp" => "2016-04-13T05:25:24.000Z",
"beat" => {
"hostname" => "PROXY-752010",
"name" => "PROXY-752010"
},
"count" => 1,
"input_type" => "log",
"offset" => 8407994036,
"source" => "/var/log/squid3/access.log",
"type" => "squid",
"host" => "PROXY-752010",
"tags" => [
[0] "beats_input_codec_plain_applied"
],
"timestamp" => "1460525124",
"timestamp_ms" => "720",
"response_time" => "9",
"src_ip" => "10.35.75.58",
"squid_request_status" => "TCP_MISS",
"http_status_code" => "200",
"reply_size_include_header" => "573",
"http_method" => "GET",
"request_url" => "https://safebrowsing-cache.google.com/safebrowsing/rd/ChRnb29nLXVud2FudGVkLXNoYXZhcjgBQAJKDAgBEPrnAxj85wMgAUoMCAAQv5cEGMCXBCAB",
"user" => "default",
"squid" => "ORIGINAL_DST",
"server_ip" => "172.217.16.78",
"content_type" => "application/vnd.google.safebrowsing-chunk"
}