Newer
Older
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
class Hit(object):
    """
    Simple attribute container.

    Every keyword argument given to the constructor is stored as an instance
    attribute of the same name; more attributes can be attached afterwards.
    """

    def __init__(self, **kwargs):
        # items() rather than the Python-2-only iteritems(): identical result
        # on Python 2, and the container also works on Python 3.
        for key, value in kwargs.items():
            setattr(self, key, value)
        super(Hit, self).__init__()
class Parser(object):
"""
The Parser parses the lines in a specified file and inserts them into
a Queue.
"""
## All check_* methods are called for each hit and must return True if the
## hit can be imported, False otherwise.
def check_hostname(self, hit):
    """
    Tell whether the hit's host is accepted by config.options.hostnames.

    Returns True when the hit has no `host` attribute, when no hostname
    pattern is configured, or when the host matches at least one pattern
    (fnmatch-style). Otherwise the skipped-hostname counter is incremented
    and False is returned.
    """
    # Nothing to filter on.
    if not hasattr(hit, 'host'):
        return True
    if not config.options.hostnames:
        return True

    # One matching pattern is enough to accept the hit.
    for pattern in config.options.hostnames:
        if fnmatch.fnmatch(hit.host, pattern):
            return True

    stats.count_lines_hostname_skipped.increment()
    return False
def check_static(self, hit):
    """
    Decide whether a hit on a static file may be imported.

    A hit whose lower-cased path ends with none of STATIC_EXTENSIONS is
    always accepted. A static hit is accepted, and flagged through
    `hit.is_download`, only when config.options.enable_static is set;
    otherwise the static counter is incremented and False is returned.
    """
    is_static = any(
        hit.path.lower().endswith(suffix) for suffix in STATIC_EXTENSIONS
    )
    if not is_static:
        return True

    if config.options.enable_static:
        hit.is_download = True
        return True

    stats.count_lines_static.increment()
    return False
def check_download(self, hit):
    """
    Flag hits on downloadable files; never rejects a hit.

    For each entry of DOWNLOAD_EXTENSIONS that the lower-cased path ends
    with, the download counter is incremented and `hit.is_download` is set.
    Always returns True.
    """
    for suffix in DOWNLOAD_EXTENSIONS:
        if not hit.path.lower().endswith(suffix):
            continue
        stats.count_lines_downloads.increment()
        hit.is_download = True
    return True
def check_user_agent(self, hit):
    """
    Decide whether a hit may be imported given its user agent.

    The hit's user agent is searched for every substring listed in
    EXCLUDED_USER_AGENTS and config.options.excluded_useragents.

    Returns True when no excluded substring is found. When one is found,
    the hit is accepted and flagged through `hit.is_robot` only if
    config.options.enable_bots is set; otherwise the skipped-user-agent
    counter is incremented and False is returned.
    """
    # The original body read an undefined `user_agent` name; take it from
    # the hit.
    # NOTE(review): the comparison is case-sensitive as written - confirm
    # whether the exclusion lists are lower-cased and the user agent should
    # be lower-cased to match.
    user_agent = hit.user_agent

    excluded = itertools.chain(
        EXCLUDED_USER_AGENTS, config.options.excluded_useragents
    )
    if not any(s in user_agent for s in excluded):
        # Explicit True: falling off the end returned None, which made the
        # caller's all() reject every regular visitor.
        return True

    if config.options.enable_bots:
        hit.is_robot = True
        return True

    stats.count_lines_skipped_user_agent.increment()
    return False
def check_http_error(self, hit):
    """
    Decide whether a hit with a 4xx/5xx status may be imported.

    Hits whose status starts with neither '4' nor '5' are always accepted.
    An error hit is accepted, and flagged through `hit.is_error`, only when
    config.options.enable_http_errors is set; otherwise the skipped-errors
    counter is incremented and False is returned.
    """
    if not hit.status.startswith(('4', '5')):
        return True

    if config.options.enable_http_errors:
        hit.is_error = True
        return True

    stats.count_lines_skipped_http_errors.increment()
    return False
def check_http_redirect(self, hit):
    """
    Decide whether a hit with a 3xx status may be imported.

    A status starting with '3' counts as a redirect, except '304' which is
    treated like a regular hit. A redirect is accepted, and flagged through
    `hit.is_redirect`, only when config.options.enable_http_redirects is
    set; otherwise the skipped-redirects counter is incremented and False
    is returned.
    """
    redirected = hit.status.startswith('3') and hit.status != '304'
    if not redirected:
        return True

    if config.options.enable_http_redirects:
        hit.is_redirect = True
        return True

    stats.count_lines_skipped_http_redirects.increment()
    return False
def check_path(self, hit):
    """
    Reject hits whose path matches one of config.options.excluded_paths.

    Patterns are fnmatch-style. Returns False on the first match, True when
    no pattern matches.
    """
    return not any(
        fnmatch.fnmatch(hit.path, pattern)
        for pattern in config.options.excluded_paths
    )
Cyril Bay
a validé
@staticmethod
def detect_format(file):
    """
    Return the format matching this file, or None if none was found.

    Each entry of FORMATS is asked, through its check_format() method,
    whether it recognises `file`; the first truthy answer is returned.
    """
    logging.debug('Detecting the log format')
    for name, candidate_format in FORMATS.iteritems():
        detected = candidate_format.check_format(file)
        if detected:
            logging.debug('Format %s matches', name)
            # Hand the match back: without this return the caller always
            # received None and aborted with "Cannot guess the logs format".
            return detected
        logging.debug('Format %s does not match', name)
    return None
def parse(self, filename):
    """
    Parse the specified filename and insert hits in the queue.

    `filename` is a path to a plain, '.gz' or '.bz2' log file, or '-' to
    read from standard input. A missing file is reported on stderr and the
    method returns without parsing anything.

    The log format is config.format when set, otherwise it is detected from
    the file; if detection fails the process is terminated via
    fatal_error(). Every line that decodes, is past the configured skip
    count, matches the format regex and passes all check_* methods is
    turned into a Hit and handed to Recorder.add_hit(). Other lines are
    counted as invalid or skipped.
    """
    def invalid_line(line, reason):
        stats.count_lines_invalid.increment()
        if config.options.debug >= 2:
            logging.debug('Invalid line detected (%s): %s' % (reason, line))

    def utc_offset(timezone):
        # Convert a numeric +/-HHMM value into a timedelta, keeping hours
        # and minutes apart: dividing by 100 turned +0530 into 5.3 hours.
        # Assumes the captured timezone is +/-HHMM - TODO confirm against
        # the format regexes.
        sign = -1 if timezone < 0 else 1
        hours, minutes = divmod(abs(int(timezone)), 100)
        return sign * datetime.timedelta(hours=hours, minutes=minutes)

    if filename == '-':
        filename = '(stdin)'
        log_file = sys.stdin
    else:
        if not os.path.exists(filename):
            print >> sys.stderr, 'File %s does not exist' % filename
            return
        if filename.endswith('.bz2'):
            open_func = bz2.BZ2File
        elif filename.endswith('.gz'):
            open_func = gzip.open
        else:
            open_func = open
        log_file = open_func(filename, 'r')

    try:
        if config.options.show_progress:
            # Parenthesised so the line is valid both as a print statement
            # and as a print() call; the output is unchanged.
            print('Parsing log %s...' % filename)

        if config.format:
            # The format was explicitly specified.
            log_format = config.format
        else:
            log_format = self.detect_format(log_file)
            if log_format is None:
                return fatal_error(
                    'Cannot guess the logs format. Please give one using '
                    'either the --log-format-name or --log-format-regex option'
                )
        # Make sure the format is compatible with the resolver.
        resolver.check_format(log_format)

        for lineno, line in enumerate(log_file):
            try:
                line = line.decode(config.options.encoding)
            except UnicodeDecodeError:
                invalid_line(line, 'invalid encoding')
                continue

            stats.count_lines_parsed.increment()
            if stats.count_lines_parsed.value <= config.options.skip:
                continue

            match = log_format.regex.match(line)
            if not match:
                # Only lines the regex rejects are invalid; without this
                # guard every line was discarded.
                invalid_line(line, 'line did not match')
                continue

            hit = Hit(
                filename=filename,
                lineno=lineno,
                status=match.group('status'),
                full_path=match.group('path'),
                # The check_* methods only set these flags to True, so give
                # all of them a False default.
                is_download=False,
                is_robot=False,
                is_error=False,
                is_redirect=False,
            )

            # NOTE(review): the scraped source is ambiguous around the query
            # string handling; this keeps the visible logic. Confirm whether
            # stripping was meant to be controlled by an option.
            try:
                query_string = match.group('query_string')
            except IndexError:
                # No separate query string group: strip the query string
                # from the path.
                hit.path = hit.full_path.split(config.options.query_string_delimiter, 1)[0]
            else:
                # Merge the query string
                hit.path = '%s%s%s' % (hit.full_path, config.options.query_string_delimiter, query_string)

            date_string = match.group('date')
            try:
                hit.date = datetime.datetime.strptime(date_string, log_format.date_format)
            except ValueError:
                invalid_line(line, 'invalid date')
                continue

            # Parse timezone and subtract its value from the date
            try:
                offset = utc_offset(float(match.group('timezone')))
            except IndexError:
                # The format has no timezone group: keep the date as parsed.
                offset = None
            except (ValueError, OverflowError):
                invalid_line(line, 'invalid timezone')
                continue
            if offset:
                hit.date -= offset

            try:
                hit.referrer = match.group('referrer')
            except IndexError:
                hit.referrer = ''
            if hit.referrer == '-':
                hit.referrer = ''

            try:
                hit.user_agent = match.group('user_agent')
            except IndexError:
                hit.user_agent = ''

            hit.ip = match.group('ip')
            try:
                hit.length = int(match.group('length'))
            except (ValueError, IndexError):
                # Some lines or formats don't have a length (e.g. 304 redirects, IIS logs)
                hit.length = 0

            try:
                hit.host = match.group('host')
            except IndexError:
                # Some formats have no host.
                pass

            # Check if the hit must be excluded.
            check_methods = inspect.getmembers(self, predicate=inspect.ismethod)
            if all((method(hit) for name, method in check_methods if name.startswith('check_'))):
                Recorder.add_hit(hit)
    finally:
        # Release the handle opened above; stdin is left alone.
        if log_file is not sys.stdin:
            log_file.close()
def main():
    """
    Start the importing process.

    Starts the progress monitor when config.options.show_progress is set,
    launches the recorders, parses every file in config.filenames, waits
    for the recorders to drain, then stops the timers, asks for the reports
    to be invalidated and prints the summary.
    """
    if config.options.show_progress:
        stats.start_monitor()

    stats.set_time_start()
    recorders = Recorder.launch(config.options.recorders)

    try:
        for filename in config.filenames:
            parser.parse(filename)
        Recorder.wait_empty()
    except KeyboardInterrupt:
        # Deliberate: an interrupted import still falls through to the
        # shutdown steps and the summary below.
        pass

    stats.set_time_stop()
    if config.options.show_progress:
        stats.stop_monitor()

    try:
        Recorder.invalidate_reports()
    except Piwik.Error:
        # Still best-effort, but leave a trace instead of swallowing the
        # error silently.
        logging.debug('Could not invalidate reports', exc_info=True)

    stats.print_summary()
def fatal_error(error, filename=None, lineno=None):
    """
    Report a fatal error on stderr and terminate the process.

    When both `filename` and `lineno` are given, a hint explaining how to
    resume the import with --skip is printed as well. The process is then
    ended through os._exit(1); this function does not return.
    """
    print >> sys.stderr, 'Fatal error: %s' % error

    can_resume = bool(filename) and lineno is not None
    if can_resume:
        hint = (
            'You can restart the import of "%s" from the point it failed by '
            'specifying --skip=%d on the command line.\n' % (filename, lineno)
        )
        print >> sys.stderr, hint

    os._exit(1)
if __name__ == '__main__':
    # Script entry point: build the module-level objects the rest of the
    # file refers to by name, then run the import.
    try:
        piwik = Piwik()
        config = Configuration()
        stats = Statistics()
        resolver = config.get_resolver()
        parser = Parser()
        main()
    except KeyboardInterrupt:
        # Ctrl-C during start-up or import: exit without a traceback.
        pass