diff --git a/misc/log-analytics/import_logs.py b/misc/log-analytics/import_logs.py index 632d7b69f230f97e5341ac86319633f0634bfbb3..223ae37624f47f0ee5eaa4c07fb586225bdcfd8a 100755 --- a/misc/log-analytics/import_logs.py +++ b/misc/log-analytics/import_logs.py @@ -47,6 +47,10 @@ except ImportError: print >> sys.stderr, 'simplejson (http://pypi.python.org/pypi/simplejson/) is required.' sys.exit(1) +try: + from collections import OrderedDict +except ImportError: + from ordereddict import OrderedDict ## @@ -1592,6 +1596,7 @@ class Parser(object): resolver.check_format(format) hits = [] + cache_dates = OrderedDict() for lineno, line in enumerate(file): try: line = line.decode(config.options.encoding) @@ -1669,24 +1674,33 @@ class Parser(object): # Parse date. # We parse it after calling check_methods as it's quite CPU hungry, and # we want to avoid that cost for excluded hits. - date_string = format.get('date') - try: - hit.date = datetime.datetime.strptime(date_string, format.date_format) - except ValueError: - invalid_line(line, 'invalid date') - continue + # To mitigate CPU usage, parsed dates are cached. + date_key = format.get('date') + '|' + format.get('timezone') + hit.date = cache_dates.get(date_key) + if not hit.date: + date_string = format.get('date') + try: + hit.date = datetime.datetime.strptime(date_string, format.date_format) + except ValueError: + invalid_line(line, 'invalid date') + continue - # Parse timezone and substract its value from the date - try: - timezone = float(format.get('timezone')) - except BaseFormatException: - timezone = 0 - except ValueError: - invalid_line(line, 'invalid timezone') - continue + # Parse timezone and substract its value from the date + try: + timezone = float(format.get('timezone')) + except BaseFormatException: + timezone = 0 + except ValueError: + invalid_line(line, 'invalid timezone') + continue + + if timezone: + hit.date -= datetime.timedelta(hours=timezone/100) + + if len(cache_dates) > 3600: + cache_dates.popitem(False) + cache_dates[date_key] = hit.date - if timezone: - hit.date -= datetime.timedelta(hours=timezone/100) if config.options.replay_tracking: # we need a query string and we only consider requests with piwik.php if not hit.query_string or not hit.path.lower().endswith('piwik.php'):