From 2e13a1a2b12e72f15ca2b48301e70bdf20e1f02d Mon Sep 17 00:00:00 2001 From: diosmosis <benaka@piwik.pro> Date: Thu, 12 Feb 2015 15:18:46 -0800 Subject: [PATCH] Refs #7151, add tests for new log importer options, fix small bug where userid was not set in hit object, fix ignore group option value splitting. --- misc/log-analytics/import_logs.py | 14 +++- misc/log-analytics/tests/logs/iis.log | 2 +- misc/log-analytics/tests/tests.py | 102 +++++++++++++++++++++++++- 3 files changed, 109 insertions(+), 9 deletions(-) diff --git a/misc/log-analytics/import_logs.py b/misc/log-analytics/import_logs.py index 43448b3c1f..55ec11d4df 100755 --- a/misc/log-analytics/import_logs.py +++ b/misc/log-analytics/import_logs.py @@ -784,7 +784,7 @@ class Configuration(object): self.options.download_extensions = DOWNLOAD_EXTENSIONS if self.options.regex_groups_to_ignore: - self.options.regex_groups_to_ignore = set(self.options.regex_groups_to_ignore.split()) + self.options.regex_groups_to_ignore = set(self.options.regex_groups_to_ignore.split(',')) def __init__(self): self._parse_args(self._create_parser()) @@ -1969,7 +1969,7 @@ class Parser(object): userid = format.get('userid') if userid != '-': - hit.args['uid'] = userid + hit.args['uid'] = hit.userid = userid except: pass @@ -2042,10 +2042,16 @@ class Parser(object): def _add_custom_vars_from_regex_groups(self, hit, format, groups, is_page_var): for group_name, custom_var_name in groups.iteritems(): if group_name in format.get_all(): + value = format.get(group_name) + + # don't track the '-' empty placeholder value + if value == '-': + continue + if is_page_var: - hit.add_page_custom_var(custom_var_name, format.get(group_name)) + hit.add_page_custom_var(custom_var_name, value) else: - hit.add_visit_custom_var(custom_var_name, format.get(group_name)) + hit.add_visit_custom_var(custom_var_name, value) def main(): """ diff --git a/misc/log-analytics/tests/logs/iis.log b/misc/log-analytics/tests/logs/iis.log index 0ec7bf504f..f25cc5fad6 
100644 --- a/misc/log-analytics/tests/logs/iis.log +++ b/misc/log-analytics/tests/logs/iis.log @@ -2,4 +2,4 @@ #Version: 1.0 #Date: 2012-04-01 00:00:13 #Fields: date time s-sitename s-computername s-ip cs-method cs-uri-stem cs-uri-query s-port cs-username c-ip cs-version cs(User-Agent) cs(Cookie) cs(Referer) cs-host sc-status sc-substatus sc-win32-status sc-bytes cs-bytes time-taken -2012-04-01 00:00:13 W3SVC834221556 PXQD1 1.2.3.4 GET /foo/bar topCat1=divinity&submit=Search 80 - 5.6.7.8 HTTP/1.1 Mozilla/5.0+(X11;+U;+Linux+i686;+en-US;+rv:1.9.2.7)+Gecko/20100722+Firefox/3.6.7 - - example.com 200 0 0 27028 214 1687 +2012-04-01 00:00:13 W3SVC834221556 PXQD1 1.2.3.4 GET /foo/bar topCat1=divinity&submit=Search 80 theuser 5.6.7.8 HTTP/1.1 Mozilla/5.0+(X11;+U;+Linux+i686;+en-US;+rv:1.9.2.7)+Gecko/20100722+Firefox/3.6.7 - - example.com 200 654 456 27028 214 1687 diff --git a/misc/log-analytics/tests/tests.py b/misc/log-analytics/tests/tests.py index a550e3a388..81b27ad36f 100644 --- a/misc/log-analytics/tests/tests.py +++ b/misc/log-analytics/tests/tests.py @@ -95,7 +95,13 @@ class Options(object): enable_http_errors = False download_extensions = 'doc,pdf' custom_w3c_fields = {} + dump_log_regex = False w3c_time_taken_in_millisecs = False + w3c_fields = None + w3c_field_regexes = {} + regex_group_to_visit_cvars_map = {} + regex_group_to_page_cvars_map = {} + regex_groups_to_ignore = None class Config(object): """Mock configuration.""" @@ -460,8 +466,6 @@ def test_amazon_cloudfront_web_parsing(): hits = [hit.__dict__ for hit in Recorder.recorders] - import_logs.logging.debug(hits) - assert hits[0]['status'] == u'200' assert hits[0]['userid'] == None assert hits[0]['is_error'] == False @@ -502,8 +506,6 @@ def test_amazon_cloudfront_rtmp_parsing(): hits = [hit.__dict__ for hit in Recorder.recorders] - import_logs.logging.debug(hits) - assert hits[0]['is_download'] == False assert hits[0]['ip'] == u'192.0.2.147' assert hits[0]['is_redirect'] == False @@ -552,3 +554,95 @@ def 
test_amazon_cloudfront_rtmp_parsing(): assert hits[1]['full_path'] == u'/shqshne4jdp4b6.cloudfront.net/cfx/st\u200b' assert len(hits) == 2 + +def test_ignore_groups_option_removes_groups(): + """Test that the --ignore-groups option removes groups so they do not appear in hits.""" + + file_ = 'logs/iis.log' + + # have to override previous globals override for this test + import_logs.config.options.custom_w3c_fields = {} + Recorder.recorders = [] + import_logs.parser = import_logs.Parser() + import_logs.config.format = None + import_logs.config.options.enable_http_redirects = True + import_logs.config.options.enable_http_errors = True + import_logs.config.options.replay_tracking = False + import_logs.config.options.w3c_time_taken_in_millisecs = True + import_logs.config.options.regex_groups_to_ignore = set(['userid','generation_time_milli']) + import_logs.parser.parse(file_) + + hits = [hit.__dict__ for hit in Recorder.recorders] + + assert hits[0]['userid'] == None + assert hits[0]['generation_time_milli'] == 0 + +def test_regex_group_to_custom_var_options(): + """Test that the --regex-group-to-visit-cvar and --regex-group-to-page-cvar track regex groups to custom vars.""" + + file_ = 'logs/iis.log' + + # have to override previous globals override for this test + import_logs.config.options.custom_w3c_fields = {} + Recorder.recorders = [] + import_logs.parser = import_logs.Parser() + import_logs.config.format = None + import_logs.config.options.enable_http_redirects = True + import_logs.config.options.enable_http_errors = True + import_logs.config.options.replay_tracking = False + import_logs.config.options.w3c_time_taken_in_millisecs = True + import_logs.config.options.regex_groups_to_ignore = set() + import_logs.config.options.regex_group_to_visit_cvars_map = { + 'userid': "User Name", + 'date': "The Date" + } + import_logs.config.options.regex_group_to_page_cvars_map = { + 'generation_time_milli': 'Geneartion Time', + 'referrer': 'The Referrer' + } + 
import_logs.parser.parse(file_) + + hits = [hit.__dict__ for hit in Recorder.recorders] + + assert hits[0]['args']['_cvar'] == {1: ['The Date', '2012-04-01 00:00:13'], 2: ['User Name', 'theuser']} # check visit custom vars + assert hits[0]['args']['cvar'] == {1: ['Geneartion Time', '1687']} # check page custom vars + + assert hits[0]['userid'] == 'theuser' + assert hits[0]['date'] == datetime.datetime(2012, 4, 1, 0, 0, 13) + assert hits[0]['generation_time_milli'] == 1687 + assert hits[0]['referrer'] == '' + +def test_w3c_custom_field_regex_option(): + """Test that --w3c-field-regex can be used to match custom W3C log fields.""" + + file_ = 'logs/iis.log' + + # have to override previous globals override for this test + import_logs.config.options.custom_w3c_fields = {} + Recorder.recorders = [] + import_logs.parser = import_logs.Parser() + import_logs.config.format = None + import_logs.config.options.enable_http_redirects = True + import_logs.config.options.enable_http_errors = True + import_logs.config.options.replay_tracking = False + import_logs.config.options.w3c_time_taken_in_millisecs = True + import_logs.config.options.w3c_field_regexes = { + 'sc-substatus': '(?P<substatus>\S+)', + 'sc-win32-status': '(?P<win32_status>\S+)' + } + + format = import_logs.W3cExtendedFormat() + + file_handle = open(file_) + format.check_format(file_handle) + match = None + while not match: + line = file_handle.readline() + if not line: + break + match = format.match(line) + file_handle.close() + + assert match is not None + assert format.get('substatus') == '654' + assert format.get('win32_status') == '456' -- GitLab