Skip to content
Extraits de code Groupes Projets
Valider 62b43d88 rédigé par diosmosis's avatar diosmosis
Parcourir les fichiers

Fixes #3805, reverted change in log importer that looked for end-of-line after...

Fixes #3805, reverted change in log importer that looked for end-of-line after format regex match and modified format autodetection logic to pick the format based on whether the format matches and the number of groups returned in the match.

Notes:
  * Added several more tests to log importer tests.py. Added tests for checking format of log files w/ extra junk info on log lines. Added individual tests for parsing regex format.
  * Modified log files used in ImportLogs test, added extra junk info to end of some lines.
  * Fixed failing test in tests.py for the S3 log file format.
parent 1d1a2e47
Branches
Aucune étiquette associée trouvée
Aucune requête de fusion associée trouvée
...@@ -108,17 +108,23 @@ class RegexFormat(object): ...@@ -108,17 +108,23 @@ class RegexFormat(object):
def __init__(self, name, regex, date_format='%d/%b/%Y:%H:%M:%S'): def __init__(self, name, regex, date_format='%d/%b/%Y:%H:%M:%S'):
self.name = name self.name = name
self.regex = re.compile(regex + '\s*$') # make sure regex includes end of line if regex is not None:
self.regex = re.compile(regex)
self.date_format = date_format self.date_format = date_format
def check_format(self, file): def check_format(self, file):
line = file.readline() line = file.readline()
file.seek(0) file.seek(0)
if re.match(self.regex, line): return self.check_format_line(line)
return self
def check_format_line(self, line):
    """Return the match of this format's regex against ``line``.

    The regex is applied with ``re.match``, i.e. anchored at the start of
    ``line`` only; the result is a match object, or None when the line
    does not match.
    """
    pattern = self.regex
    return re.match(pattern, line)
class IisFormat(RegexFormat):
class IisFormat(object): def __init__(self):
super(IisFormat, self).__init__('iis', None, '%Y-%m-%d %H:%M:%S')
def check_format(self, file): def check_format(self, file):
line = file.readline() line = file.readline()
...@@ -151,7 +157,12 @@ class IisFormat(object): ...@@ -151,7 +157,12 @@ class IisFormat(object):
except KeyError: except KeyError:
regex = '\S+' regex = '\S+'
full_regex.append(regex) full_regex.append(regex)
return RegexFormat('iis', ' '.join(full_regex), '%Y-%m-%d %H:%M:%S') self.regex = re.compile(' '.join(full_regex))
start_pos = file.tell()
nextline = file.readline()
file.seek(start_pos)
return self.check_format_line(nextline)
...@@ -166,7 +177,7 @@ _NCSA_EXTENDED_LOG_FORMAT = (_COMMON_LOG_FORMAT + ...@@ -166,7 +177,7 @@ _NCSA_EXTENDED_LOG_FORMAT = (_COMMON_LOG_FORMAT +
_S3_LOG_FORMAT = ( _S3_LOG_FORMAT = (
'\S+ (?P<host>\S+) \[(?P<date>.*?) (?P<timezone>.*?)\] (?P<ip>\S+) ' '\S+ (?P<host>\S+) \[(?P<date>.*?) (?P<timezone>.*?)\] (?P<ip>\S+) '
'\S+ \S+ \S+ \S+ "\S+ (?P<path>.*?) \S+" (?P<status>\S+) \S+ (?P<length>\S+) ' '\S+ \S+ \S+ \S+ "\S+ (?P<path>.*?) \S+" (?P<status>\S+) \S+ (?P<length>\S+) '
'\S+ \S+ \S+ "(?P<referrer>.*?)" "(?P<user_agent>.*?)" \S+' '\S+ \S+ \S+ "(?P<referrer>.*?)" "(?P<user_agent>.*?)"'
) )
FORMATS = { FORMATS = {
...@@ -1302,16 +1313,27 @@ class Parser(object): ...@@ -1302,16 +1313,27 @@ class Parser(object):
@staticmethod @staticmethod
def detect_format(file): def detect_format(file):
""" """
Return the format matching this file, or None if none was found. Return the best matching format for this file, or None if none was found.
""" """
logging.debug('Detecting the log format') logging.debug('Detecting the log format')
format = None
format_groups = 0
for name, candidate_format in FORMATS.iteritems(): for name, candidate_format in FORMATS.iteritems():
format = candidate_format.check_format(file) match = candidate_format.check_format(file)
if format: if match:
logging.debug('Format %s matches', name) logging.debug('Format %s matches', name)
return format
# if there's more info in this match, use this format
match_groups = len(match.groups())
if format_groups < match_groups:
format = candidate_format
format_groups = match_groups
else: else:
logging.debug('Format %s does not match', name) logging.debug('Format %s does not match', name)
logging.debug('Format %s is the best match', format.name)
return format
def parse(self, filename): def parse(self, filename):
""" """
......
import functools import functools
import os
import import_logs import import_logs
# utility functions
def add_junk_to_file(path):
    """Write a copy of the file at ``path`` to ``tmp.log`` with junk appended.

    The text ``' junk'`` is appended once, to the very end of the file
    contents (not to every line).  ``tmp.log`` is created in the current
    working directory and is overwritten if it already exists.

    Returns the path of the copy, which is always ``'tmp.log'``.
    """
    # Context managers close both handles even if reading or writing raises,
    # which the previous manual open()/close() pairs did not guarantee.
    with open(path) as source:
        contents = source.read()

    with open('tmp.log', 'w') as target:
        target.write(contents + ' junk')

    return 'tmp.log'
def tearDownModule():
    """Delete the ``tmp.log`` scratch file from the working directory, if present."""
    scratch_path = 'tmp.log'
    if not os.path.exists(scratch_path):
        return
    os.remove(scratch_path)
def test_format_detection(): def test_format_detection():
def _test(format_name): def _test(format_name):
file = open('logs/%s.log' % format_name) file = open('logs/%s.log' % format_name)
assert(import_logs.Parser.detect_format(file).name == format_name) format = import_logs.Parser.detect_format(file)
assert(format is not None)
assert(format.name == format_name)
def _test_junk(format_name):
tmp_path = add_junk_to_file('logs/%s.log' % format_name)
file = open(tmp_path)
format = import_logs.Parser.detect_format(file)
assert(format is not None)
assert(format.name == format_name)
for format_name in import_logs.FORMATS.iterkeys(): for format_name in import_logs.FORMATS.iterkeys():
f = functools.partial(_test, format_name) f = functools.partial(_test, format_name)
f.description = 'Testing autodetection of format ' + format_name f.description = 'Testing autodetection of format ' + format_name
yield f yield f
f = functools.partial(_test_junk, format_name)
f.description = 'Testing autodetection of format ' + format_name + ' w/ garbage at end of line'
yield f
class Options(object): class Options(object):
...@@ -47,15 +77,15 @@ class Resolver(object): ...@@ -47,15 +77,15 @@ class Resolver(object):
class Recorder(object): class Recorder(object):
"""Mock recorder which collects hits but doesn't put their in database.""" """Mock recorder which collects hits but doesn't put their in database."""
recorders = [] recorders = []
@classmethod @classmethod
def add_hits(cls, hits): def add_hits(cls, hits):
cls.recorders.extend(hits) cls.recorders.extend(hits)
def test_replay_tracking_arguments(): def test_replay_tracking_arguments():
"""Test data parsing from sample log file.""" """Test data parsing from sample log file."""
file_ = 'logs_to_tests.log' file_ = 'logs_to_tests.log'
import_logs.stats = import_logs.Statistics() import_logs.stats = import_logs.Statistics()
import_logs.config = Config() import_logs.config = Config()
import_logs.resolver = Resolver() import_logs.resolver = Resolver()
...@@ -148,3 +178,95 @@ def test_replay_tracking_arguments(): ...@@ -148,3 +178,95 @@ def test_replay_tracking_arguments():
assert hits[2]['_id'] == '1da79fc743e8bcc4' assert hits[2]['_id'] == '1da79fc743e8bcc4'
assert hits[2]['dir'] == '1' assert hits[2]['dir'] == '1'
assert hits[2]['_refts'] == '1360047661' assert hits[2]['_refts'] == '1360047661'
def parse_log_file_line(format_name, file_):
    """Run a named log format against a log file and return its named groups.

    Looks up ``format_name`` in ``import_logs.FORMATS``, opens the file at
    path ``file_``, passes the open file to that format's ``check_format``
    and returns ``groupdict()`` of the resulting match.

    Raises KeyError if ``format_name`` is not a key of ``FORMATS`` and
    AssertionError if the format does not match the file.
    """
    log_format = import_logs.FORMATS[format_name]

    # The context manager closes the file even when check_format() raises.
    with open(file_) as log_file:
        match = log_format.check_format(log_file)

    # Fail with a readable message rather than an AttributeError on None.
    assert match is not None, (
        'format "%s" does not match %s' % (format_name, file_))
    return match.groupdict()
# check parsing groups
def check_common_groups(groups):
    """Assert the ip, date, timezone, path, status and length values in ``groups``.

    ``groups`` is a mapping of group name to captured string.  Entries are
    checked in the order listed below; the first mismatch raises
    AssertionError and a missing key raises KeyError.
    """
    expected = (
        ('ip', '1.2.3.4'),
        ('date', '10/Feb/2012:16:42:07'),
        ('timezone', '-0500'),
        ('path', '/'),
        ('status', '301'),
        ('length', '368'),
    )
    for name, value in expected:
        assert groups[name] == value
def check_ncsa_extended_groups(groups):
    """Assert the common values plus the expected referrer and user agent.

    Delegates to ``check_common_groups`` first, then checks the
    ``referrer`` and ``user_agent`` entries of ``groups``.
    """
    check_common_groups(groups)

    expected = (
        ('referrer', '-'),
        ('user_agent', 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11'),
    )
    for name, value in expected:
        assert groups[name] == value
def check_common_vhost_groups(groups):
    """Assert the common values plus a ``host`` of ``'www.example.com'``."""
    check_common_groups(groups)

    host = groups['host']
    assert host == 'www.example.com'
def check_common_complete_groups(groups):
    """Assert the NCSA-extended values plus a ``host`` of ``'www.example.com'``."""
    check_ncsa_extended_groups(groups)

    host = groups['host']
    assert host == 'www.example.com'
def check_iis_groups(groups):
    """Assert the expected IIS values and that ``groups`` has no other keys.

    Each listed entry of ``groups`` is compared with its expected value, in
    order.  Afterwards every key present in ``groups`` must be one of the
    listed names, so unexpected extra groups also fail the check.
    """
    expected = (
        ('date', '2012-04-01 00:00:13'),
        ('path', '/foo/bar'),
        ('query_string', 'topCat1=divinity&submit=Search'),
        ('ip', '5.6.7.8'),
        ('referrer', '-'),
        ('user_agent', 'Mozilla/5.0+(X11;+U;+Linux+i686;+en-US;+rv:1.9.2.7)+Gecko/20100722+Firefox/3.6.7'),
        ('status', '200'),
        ('length', '27028'),
        ('host', 'example.com'),
    )
    for name, value in expected:
        assert groups[name] == value

    # The allowed key names are exactly the names checked above.
    expected_hit_properties = [name for name, _ in expected]
    for property_name in groups.keys():
        assert property_name in expected_hit_properties
def check_s3_groups(groups):
    """Assert the values expected in ``groups`` for the S3 format.

    Entries are checked in the order listed below; the first mismatch
    raises AssertionError and a missing key raises KeyError.
    """
    expected = (
        ('host', 'www.example.com'),
        ('date', '10/Feb/2012:16:42:07'),
        ('timezone', '-0500'),
        ('ip', '1.2.3.4'),
        ('path', '/index'),
        ('status', '200'),
        ('length', '368'),
        ('referrer', '-'),
        ('user_agent', 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11'),
    )
    for name, value in expected:
        assert groups[name] == value
def check_match_groups(format_name, groups):
    """Call the module-level ``check_<format_name>_groups`` function on ``groups``.

    The checker is looked up by name in this module's globals, so a
    ``format_name`` without a matching function raises KeyError.  Returns
    whatever the checker returns.
    """
    checker_name = ''.join(('check_', format_name, '_groups'))
    checker = globals()[checker_name]
    return checker(groups)
# parsing tests
def test_format_parsing():
    """Yield parsing checks for every format in ``import_logs.FORMATS``.

    For each format name two callables are yielded, both run against
    ``logs/<format_name>.log``: one on the file as-is and one on a copy
    produced by ``add_junk_to_file``.  A final callable checks that the
    "common" format also parses ``logs/ncsa_extended.log``.  Every yielded
    callable carries a ``description`` attribute.
    """
    def _test(format_name, path):
        # The format's regex must yield the expected named groups.
        groups = parse_log_file_line(format_name, path)
        check_match_groups(format_name, groups)

    def _test_with_junk(format_name, path):
        # Same check, on a copy of the log with junk appended to its contents.
        tmp_path = add_junk_to_file(path)
        _test(format_name, tmp_path)

    for format_name in import_logs.FORMATS.iterkeys():
        log_path = 'logs/' + format_name + '.log'

        f = functools.partial(_test, format_name, log_path)
        f.description = 'Testing parsing of format "%s"' % format_name
        yield f

        f = functools.partial(_test_with_junk, format_name, log_path)
        # Description typo fixed ("parsin" -> "parsing").
        f.description = 'Testing parsing of format "%s" with junk appended to path' % format_name
        yield f

    f = functools.partial(_test, 'common', 'logs/ncsa_extended.log')
    f.description = 'Testing parsing of format "common" with ncsa_extended log'
    yield f
...@@ -2,10 +2,10 @@ ...@@ -2,10 +2,10 @@
175.41.192.40 - - [09/Aug/2012:10:11:30 +0200] "GET /faq/ HTTP/1.1" 200 3574 "-" "Mozilla/5.0 (Linux; U; Android 2.3.5; en-us; HTC Vision Build/GRI40) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1" 175.41.192.40 - - [09/Aug/2012:10:11:30 +0200] "GET /faq/ HTTP/1.1" 200 3574 "-" "Mozilla/5.0 (Linux; U; Android 2.3.5; en-us; HTC Vision Build/GRI40) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1"
175.41.192.40 - - [09/Aug/2012:10:11:56 +0200] "GET /blog/category/community/ HTTP/1.1" 200 3574 "-" "Mozilla/5.0 (X11; U; Linux x86_64; ca-ad) AppleWebKit/531.2+ (KHTML, like Gecko) Safari/531.2+ Epiphany/2.30.6" 175.41.192.40 - - [09/Aug/2012:10:11:56 +0200] "GET /blog/category/community/ HTTP/1.1" 200 3574 "-" "Mozilla/5.0 (X11; U; Linux x86_64; ca-ad) AppleWebKit/531.2+ (KHTML, like Gecko) Safari/531.2+ Epiphany/2.30.6"
175.41.192.40 - - [09/Aug/2012:10:12:03 +0200] "GET /docs/manage-websites/ HTTP/1.1" 200 3574 "-" "Mozilla/5.0 (X11; Linux i686; rv:6.0) Gecko/20100101 Firefox/6.0" 175.41.192.40 - - [09/Aug/2012:10:12:03 +0200] "GET /docs/manage-websites/ HTTP/1.1" 200 3574 "-" "Mozilla/5.0 (X11; Linux i686; rv:6.0) Gecko/20100101 Firefox/6.0"
72.44.32.10 - - [09/Aug/2012:15:48:07 +0200] "GET / HTTP/1.1" 200 3574 "-" "Mozilla/5.0 (compatible; MSIE 10.6; Windows NT 6.1; Trident/5.0; InfoPath.2; SLCC1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 2.0.50727) 3gpp-gba UNTRUSTED/1.0" 72.44.32.10 - - [09/Aug/2012:15:48:07 +0200] "GET / HTTP/1.1" 200 3574 "-" "Mozilla/5.0 (compatible; MSIE 10.6; Windows NT 6.1; Trident/5.0; InfoPath.2; SLCC1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 2.0.50727) 3gpp-gba UNTRUSTED/1.0" junk extra
72.44.32.10 - - [09/Aug/2012:15:48:20 +0200] "GET /download/counter/ HTTP/1.1" 200 3574 "-" "Mozilla/5.0 (X11; U; Linux x86; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Epiphany/2.30.6 Safari/534.7" 72.44.32.10 - - [09/Aug/2012:15:48:20 +0200] "GET /download/counter/ HTTP/1.1" 200 3574 "-" "Mozilla/5.0 (X11; U; Linux x86; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Epiphany/2.30.6 Safari/534.7"
72.44.32.10 - - [09/Aug/2012:15:49:48 +0200] "GET /translations/ HTTP/1.1" 200 3574 "-" "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5" 72.44.32.10 - - [09/Aug/2012:15:49:48 +0200] "GET /translations/ HTTP/1.1" 200 3574 "-" "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5"
175.41.192.09 - - [09/Aug/2012:22:56:45 +0200] "GET /docs/ HTTP/1.1" 200 3574 "-" "Mozilla/5.0 (X11; Linux i686; rv:6.0) Gecko/20100101 Firefox/6.0" 175.41.192.09 - - [09/Aug/2012:22:56:45 +0200] "GET /docs/ HTTP/1.1" 200 3574 "-" "Mozilla/5.0 (X11; Linux i686; rv:6.0) Gecko/20100101 Firefox/6.0" 456 789
175.41.192.09 - - [09/Aug/2012:23:00:42 +0200] "GET /docs/manage-users/ HTTP/1.1" 200 3574 "-" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3" 175.41.192.09 - - [09/Aug/2012:23:00:42 +0200] "GET /docs/manage-users/ HTTP/1.1" 200 3574 "-" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3"
79.125.00.21 - - [10/Aug/2012:20:03:40 +0200] "GET /newsletter/ HTTP/1.1" 200 3574 "-" "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/5.0)" 79.125.00.21 - - [10/Aug/2012:20:03:40 +0200] "GET /newsletter/ HTTP/1.1" 200 3574 "-" "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/5.0)"
175.41.192.34 - - [10/Aug/2012:21:59:50 +0200] "GET /faq/how-to/ HTTP/1.1" 200 3574 "-" "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/5.0)" 175.41.192.34 - - [10/Aug/2012:21:59:50 +0200] "GET /faq/how-to/ HTTP/1.1" 200 3574 "-" "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/5.0)"
......
1.2.3.4 - - [11/Aug/2012:18:46:03 +0100] "GET /19.pdf HTTP/1.0" 200 4324023 "-" "Apache-HttpClient/4.2.1 (java 1.5)" 1.2.3.4 - - [11/Aug/2012:18:46:03 +0100] "GET /19.pdf HTTP/1.0" 200 4324023 "-" "Apache-HttpClient/4.2.1 (java 1.5)"
175.41.192.41 - - [11/Aug/2012:18:10:38 +0200] "GET /blog/category/meta/ HTTP/1.1" 200 3574 "-" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_7) AppleWebKit/534.24 (KHTML, like Gecko) RockMelt/0.9.58.494 Chrome/11.0.696.71 Safari/534.24" 175.41.192.41 - - [11/Aug/2012:18:10:38 +0200] "GET /blog/category/meta/ HTTP/1.1" 200 3574 "-" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_7) AppleWebKit/534.24 (KHTML, like Gecko) RockMelt/0.9.58.494 Chrome/11.0.696.71 Safari/534.24" junk extra
175.41.192.41 - - [11/Aug/2012:18:11:30 +0200] "GET /this/is/not/the/page/i/am/looking/for/ HTTP/1.1" 404 3574 "-" "Mozilla/5.0 (Linux; U; Android 2.3.5; en-us; HTC Vision Build/GRI40) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1" 175.41.192.41 - - [11/Aug/2012:18:11:30 +0200] "GET /this/is/not/the/page/i/am/looking/for/ HTTP/1.1" 404 3574 "-" "Mozilla/5.0 (Linux; U; Android 2.3.5; en-us; HTC Vision Build/GRI40) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1"
72.44.32.11 - - [11/Aug/2012:19:48:07 +0200] "GET /to-an-error HTTP/1.1" 500 3574 "-" "Mozilla/5.0 (compatible; MSIE 10.6; Windows NT 6.1; Trident/5.0; InfoPath.2; SLCC1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 2.0.50727) 3gpp-gba UNTRUSTED/1.0" 72.44.32.11 - - [11/Aug/2012:19:48:07 +0200] "GET /to-an-error HTTP/1.1" 500 3574 "-" "Mozilla/5.0 (compatible; MSIE 10.6; Windows NT 6.1; Trident/5.0; InfoPath.2; SLCC1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 2.0.50727) 3gpp-gba UNTRUSTED/1.0"
72.44.32.11 - - [11/Aug/2012:19:48:08 +0200] "GET / HTTP/1.1" 200 3574 "-" "Googlebot/2.1 (+http://www.googlebot.com/bot.html)" 72.44.32.11 - - [11/Aug/2012:19:48:08 +0200] "GET / HTTP/1.1" 200 3574 "-" "Googlebot/2.1 (+http://www.googlebot.com/bot.html)"
......
0% Chargement en cours ou .
You are about to add 0 people to the discussion. Proceed with caution.
Veuillez vous inscrire ou vous connecter pour commenter