Skip to content
Extraits de code Groupes Projets
Valider 8bcfd9f1 rédigé par Fabian Becker's avatar Fabian Becker
Parcourir les fichiers

Merge pull request #120 from gregorg/master

Add JsonFormat to handle json logs without regex parsing.
parents 6a4284b2 8e4a7ef5
Aucune branche associée trouvée
Aucune étiquette associée trouvée
Aucune requête de fusion associée trouvée
......@@ -192,3 +192,68 @@ Apache LogFormat "%h %l %u %t \"%r\" %>s %b %D"
Note: the group <generation_time_milli> is also available if your server logs generation time in milliseconds rather than microseconds.
## Set up Nginx to import directly into Piwik via syslog
With the syslog patch from http://wiki.nginx.org/3rdPartyModules, which is compiled into dotdeb's release, you can log to syslog and import the entries live into Piwik.
Path: Nginx -> syslog -> (syslog central server) -> this script -> piwik
You can use any log format that this script can handle, such as Apache Combined or the JSON format, which needs less processing.
### Setup Nginx logs
```
http {
...
log_format piwik '{"ip": "$remote_addr",'
'"host": "$host",'
'"path": "$request_uri",'
'"status": "$status",'
'"referrer": "$http_referer",'
'"user_agent": "$http_user_agent",'
'"length": $bytes_sent,'
'"generation_time_milli": $request_time,'
'"date": "$time_iso8601"}';
...
server {
...
access_log syslog:info piwik;
...
}
}
```
### Set up syslog-ng
This is the config for the central server if any. If not, you can also use this config on the same server as Nginx.
```
options {
stats_freq(600); stats_level(1);
log_fifo_size(1280000);
log_msg_size(8192);
};
source s_nginx { udp(); };
destination d_piwik {
program("/usr/local/piwik/piwik.sh" template("$MSG\n"));
};
log { source(s_nginx); filter(f_info); destination(d_piwik); };
```
### piwik.sh
This wrapper script only needs to pass the appropriate parameters to import_logs.py:
```
#!/bin/sh
exec python /path/to/misc/log-analytics/import_logs.py \
--url=http://localhost/ --token-auth=<your_auth_token> \
--idsite=1 --recorders=4 --enable-http-errors --enable-http-redirects --enable-static --enable-bots \
--log-format-name=nginx_json -
```
And that's all!
#!/usr/bin/python
# vim: et sw=4 ts=4:
# -*- coding: utf-8 -*-
#
# Piwik - Open source web analytics
......@@ -104,13 +105,13 @@ PIWIK_EXPECTED_IMAGE = base64.b64decode(
## Formats.
##
class RegexFormat(object):
class BaseFormatException(Exception): pass
def __init__(self, name, regex, date_format='%d/%b/%Y:%H:%M:%S'):
class BaseFormat(object):
def __init__(self, name):
self.name = name
if regex is not None:
self.regex = re.compile(regex)
self.date_format = date_format
self.regex = None
self.date_format = '%d/%b/%Y:%H:%M:%S'
def check_format(self, file):
line = file.readline()
......@@ -118,7 +119,77 @@ class RegexFormat(object):
return self.check_format_line(line)
def check_format_line(self, line):
return re.match(self.regex, line)
return False
class JsonFormat(BaseFormat):
    """Log format whose lines are JSON objects, read without regex parsing.

    The most recently decoded line is kept in ``self.json`` and queried
    through ``get()`` / ``get_all()``.
    """

    def __init__(self, name):
        super(JsonFormat, self).__init__(name)
        # Decoded dict of the last line that matched, or None.
        self.json = None
        # ISO 8601 timestamp without its UTC offset; get('date') strips the
        # offset and exposes it separately as 'timezone'.
        self.date_format = '%Y-%m-%dT%H:%M:%S'

    def _decode(self, line):
        """Return the dict decoded from ``line``, or None if it is not a JSON object."""
        try:
            record = json.loads(line)
        except Exception:
            # Any decoding failure means "this line is not in this format".
            # Unlike a bare ``except:``, this lets KeyboardInterrupt and
            # SystemExit propagate.
            return None
        # Scalars and arrays are valid JSON but cannot carry named fields.
        if not isinstance(record, dict):
            return None
        return record

    def check_format_line(self, line):
        """Return True if ``line`` decodes to a JSON object, else False."""
        self.json = self._decode(line)
        return self.json is not None

    def match(self, line):
        """Decode ``line``; return this format on success, None otherwise."""
        self.json = self._decode(line)
        if self.json is None:
            return None
        return self

    def get(self, key):
        """Return the value of field ``key`` from the last decoded line.

        Three keys are adapted on the fly, without modifying the stored
        record, so repeated calls always return the same value:

        * ``generation_time_milli``: the stored number is multiplied by 1000
          and rounded to an integer.
        * ``date``: only the first 19 characters are returned, matching
          ``self.date_format``.
        * ``timezone``: whatever follows those 19 characters in ``date``,
          with colons removed; if there is nothing there, the record's own
          ``timezone`` field is used.

        Raises:
            BaseFormatException: if no line is decoded, the field is missing,
                or its value cannot be adapted.
        """
        record = self.json
        if not isinstance(record, dict):
            raise BaseFormatException()
        try:
            if key == 'generation_time_milli':
                # round() before int(): the float product can land just below
                # the intended integer, and int() alone would truncate it.
                return int(round(float(record[key]) * 1000))
            if key == 'date':
                return record[key][:19]
            if key == 'timezone':
                offset = record.get('date', '')[19:].replace(':', '')
                if offset:
                    return offset
            return record[key]
        except (KeyError, TypeError, ValueError, AttributeError):
            raise BaseFormatException()

    def get_all(self,):
        """Return the whole decoded record (None if nothing is decoded)."""
        return self.json
class RegexFormat(BaseFormat):
    """Log format recognised by matching each line against a compiled regex.

    The result of the latest ``match()`` call is kept in ``self.matched`` and
    read back through ``get()`` / ``get_all()``.
    """

    def __init__(self, name, regex, date_format=None):
        super(RegexFormat, self).__init__(name)
        # Result of the most recent match() call; None until a line is tried.
        self.matched = None
        # Only override the attributes set by the base class when a value
        # was actually supplied.
        if date_format is not None:
            self.date_format = date_format
        if regex is not None:
            self.regex = re.compile(regex)

    def check_format_line(self, line):
        """Return the match object for ``line``, or None if it does not match."""
        return self.match(line)

    def match(self, line):
        """Match ``line`` against the regex, remember the result and return it."""
        result = self.regex.match(line)
        self.matched = result
        return result

    def get(self, key):
        """Return group ``key`` of the last match.

        Raises:
            BaseFormatException: if the regex has no group called ``key``.
        """
        try:
            value = self.matched.group(key)
        except IndexError:
            raise BaseFormatException()
        return value

    def get_all(self,):
        """Return all named groups of the last match as a dict."""
        return self.matched.groupdict()
class IisFormat(RegexFormat):
......@@ -191,6 +262,7 @@ FORMATS = {
'iis': IisFormat(),
's3': RegexFormat('s3', _S3_LOG_FORMAT),
'icecast2': RegexFormat('icecast2', _ICECAST2_LOG_FORMAT),
'nginx_json': JsonFormat('nginx_json'),
}
......@@ -1339,15 +1411,21 @@ class Parser(object):
format = None
format_groups = 0
for name, candidate_format in FORMATS.iteritems():
logging.debug("Check format %s", name)
match = candidate_format.check_format(file)
if match:
logging.debug('Format %s matches', name)
# if there's more info in this match, use this format
match_groups = len(match.groups())
if format_groups < match_groups:
# compare format groups if this *BaseFormat has groups() method
try:
# if there's more info in this match, use this format
match_groups = len(match.groups())
if format_groups < match_groups:
format = candidate_format
format_groups = match_groups
except AttributeError:
format = candidate_format
format_groups = match_groups
else:
logging.debug('Format %s does not match', name)
......@@ -1418,7 +1496,7 @@ class Parser(object):
if stats.count_lines_parsed.value <= config.options.skip:
continue
match = format.regex.match(line)
match = format.match(line)
if not match:
invalid_line(line, 'line did not match')
continue
......@@ -1426,8 +1504,8 @@ class Parser(object):
hit = Hit(
filename=filename,
lineno=lineno,
status=match.group('status'),
full_path=match.group('path'),
status=format.get('status'),
full_path=format.get('path'),
is_download=False,
is_robot=False,
is_error=False,
......@@ -1436,44 +1514,44 @@ class Parser(object):
)
try:
hit.query_string = match.group('query_string')
hit.query_string = format.get('query_string')
hit.path = hit.full_path
except IndexError:
except BaseFormatException:
hit.path, _, hit.query_string = hit.full_path.partition(config.options.query_string_delimiter)
try:
hit.referrer = match.group('referrer')
except IndexError:
hit.referrer = format.get('referrer')
except BaseFormatException:
hit.referrer = ''
if hit.referrer == '-':
hit.referrer = ''
try:
hit.user_agent = match.group('user_agent')
except IndexError:
hit.user_agent = format.get('user_agent')
except BaseFormatException:
hit.user_agent = ''
hit.ip = match.group('ip')
hit.ip = format.get('ip')
try:
hit.length = int(match.group('length'))
except (ValueError, IndexError):
hit.length = int(format.get('length'))
except BaseFormatException:
# Some lines or formats don't have a length (e.g. 304 redirects, IIS logs)
hit.length = 0
try:
hit.generation_time_milli = int(match.group('generation_time_milli'))
except IndexError:
hit.generation_time_milli = int(format.get('generation_time_milli'))
except BaseFormatException:
try:
hit.generation_time_milli = int(match.group('generation_time_micro')) / 1000
except IndexError:
hit.generation_time_milli = int(format.get('generation_time_micro')) / 1000
except BaseFormatException:
hit.generation_time_milli = 0
if config.options.log_hostname:
hit.host = config.options.log_hostname
else:
try:
hit.host = match.group('host').lower().strip('.')
except IndexError:
hit.host = format.get('host').lower().strip('.')
except BaseFormatException:
# Some formats have no host.
pass
......@@ -1484,7 +1562,7 @@ class Parser(object):
# Parse date.
# We parse it after calling check_methods as it's quite CPU hungry, and
# we want to avoid that cost for excluded hits.
date_string = match.group('date')
date_string = format.get('date')
try:
hit.date = datetime.datetime.strptime(date_string, format.date_format)
except ValueError:
......@@ -1493,8 +1571,8 @@ class Parser(object):
# Parse timezone and substract its value from the date
try:
timezone = float(match.group('timezone'))
except IndexError:
timezone = float(format.get('timezone'))
except BaseFormatException:
timezone = 0
except ValueError:
invalid_line(line, 'invalid timezone')
......
{"idsite":1,"ip": "203.38.78.246","host": "www.piwik.org","path": "/piwik.php?action_name=Clearcode%20-%20Web%20and%20Mobile%20Development%20%7C%20Technology%20With%20Passion&idsite=1&rec=1&r=983420&h=17&m=31&s=25&url=http%3A%2F%2Fclearcode.cc%2F&urlref=http%3A%2F%2Fclearcode.cc%2Fwelcome&_id=1da79fc743e8bcc4&_idts=1360047661&_idvc=1&_idn=0&_refts=1360047661&_viewts=1360047661&_ref=http%3A%2F%2Fpiwik.org%2Fthank-you-all%2F&pdf=1&qt=1&realp=0&wma=1&dir=1&fla=1&java=1&gears=0&ag=1&cookie=1&res=1680x1050","status": "200","referrer": "http://clearcode.cc/","user_agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.57 Safari/537.17","length": 192,"generation_time_milli": 0.008,"date": "2013-10-10T16:52:00+02:00"}
{"idsite":1,"ip": "203.38.78.246","host": "www.piwik.org","path": "/piwik.php?action_name=AdviserBrief%20-%20Track%20Your%20Investments%20and%20Plan%20Financial%20Future%20%7C%20Clearcode&idsite=1&rec=1&r=109464&h=17&m=31&s=40&url=http%3A%2F%2Fclearcode.cc%2Fcase%2Fadviserbrief-track-your-investments-and-plan-financial-future%2F&urlref=http%3A%2F%2Fclearcode.cc%2Fwelcome&_id=1da79fc743e8bcc4&_idts=1360047661&_idvc=1&_idn=0&_refts=1360047661&_viewts=1360047661&_ref=http%3A%2F%2Fpiwik.org%2Fthank-you-all%2F&pdf=1&qt=1&realp=0&wma=1&dir=1&fla=1&java=1&gears=0&ag=1&cookie=1&res=1680x1050","status": "200","referrer": "http://clearcode.cc/","user_agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.57 Safari/537.17","length": 192,"generation_time_milli": 0.008,"date": "2013-10-10T16:52:00+02:00"}
#!/bin/sh
cd $(dirname $0)
# Make sure nosetests is installed.
nosetests -V >/dev/null 2>&1 || (echo "nose (http://readthedocs.org/docs/nose/en/latest/) must be installed"; exit 1)
......
# vim: et sw=4 ts=4:
import functools
import os
......@@ -186,7 +187,7 @@ def parse_log_file_line(format_name, file_):
match = format.check_format(file)
file.close()
return match.groupdict()
return format.get_all()
# check parsing groups
def check_common_groups(groups):
......@@ -239,6 +240,14 @@ def check_s3_groups(groups):
assert groups['length'] == '368'
assert groups['referrer'] == '-'
assert groups['user_agent'] == 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11'
def check_nginx_json_groups(groups):
    """Assert that ``groups`` holds the values expected for the nginx_json sample log."""
    # Checked in this order; the first mismatch raises AssertionError.
    expected_fields = (
        ('host', 'www.piwik.org'),
        ('status', '200'),
        ('ip', '203.38.78.246'),
        ('length', 192),
        ('user_agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.57 Safari/537.17'),
        ('date', '2013-10-10T16:52:00+02:00'),
    )
    for field, expected in expected_fields:
        assert groups[field] == expected
def check_match_groups(format_name, groups):
symbols = globals()
......
0% Chargement en cours ou .
You are about to add 0 people to the discussion. Proceed with caution.
Terminez d'abord l'édition de ce message.
Veuillez vous inscrire ou vous pour commenter