From 2e13a1a2b12e72f15ca2b48301e70bdf20e1f02d Mon Sep 17 00:00:00 2001
From: diosmosis <benaka@piwik.pro>
Date: Thu, 12 Feb 2015 15:18:46 -0800
Subject: [PATCH] Refs #7151, add tests for new log importer options, fix small
 bug where userid was not set in hit object, fix ignore group option value
 splitting.

---
 misc/log-analytics/import_logs.py     |  14 +++-
 misc/log-analytics/tests/logs/iis.log |   2 +-
 misc/log-analytics/tests/tests.py     | 102 +++++++++++++++++++++++++-
 3 files changed, 109 insertions(+), 9 deletions(-)

diff --git a/misc/log-analytics/import_logs.py b/misc/log-analytics/import_logs.py
index 43448b3c1f..55ec11d4df 100755
--- a/misc/log-analytics/import_logs.py
+++ b/misc/log-analytics/import_logs.py
@@ -784,7 +784,7 @@ class Configuration(object):
             self.options.download_extensions = DOWNLOAD_EXTENSIONS
 
         if self.options.regex_groups_to_ignore:
-            self.options.regex_groups_to_ignore = set(self.options.regex_groups_to_ignore.split())
+            self.options.regex_groups_to_ignore = set(self.options.regex_groups_to_ignore.split(','))
 
     def __init__(self):
         self._parse_args(self._create_parser())
@@ -1969,7 +1969,7 @@ class Parser(object):
 
                 userid = format.get('userid')
                 if userid != '-':
-                    hit.args['uid'] = userid
+                    hit.args['uid'] = hit.userid = userid
             except:
                 pass
 
@@ -2042,10 +2042,16 @@ class Parser(object):
     def _add_custom_vars_from_regex_groups(self, hit, format, groups, is_page_var):
         for group_name, custom_var_name in groups.iteritems():
             if group_name in format.get_all():
+                value = format.get(group_name)
+
+                # don't track the '-' empty placeholder value
+                if value == '-':
+                    continue
+
                 if is_page_var:
-                    hit.add_page_custom_var(custom_var_name, format.get(group_name))
+                    hit.add_page_custom_var(custom_var_name, value)
                 else:
-                    hit.add_visit_custom_var(custom_var_name, format.get(group_name))
+                    hit.add_visit_custom_var(custom_var_name, value)
 
 def main():
     """
diff --git a/misc/log-analytics/tests/logs/iis.log b/misc/log-analytics/tests/logs/iis.log
index 0ec7bf504f..f25cc5fad6 100644
--- a/misc/log-analytics/tests/logs/iis.log
+++ b/misc/log-analytics/tests/logs/iis.log
@@ -2,4 +2,4 @@
 #Version: 1.0
 #Date: 2012-04-01 00:00:13
 #Fields: date time s-sitename s-computername s-ip cs-method cs-uri-stem cs-uri-query s-port cs-username c-ip cs-version cs(User-Agent) cs(Cookie) cs(Referer) cs-host sc-status sc-substatus sc-win32-status sc-bytes cs-bytes time-taken
-2012-04-01 00:00:13 W3SVC834221556 PXQD1 1.2.3.4 GET /foo/bar topCat1=divinity&submit=Search 80 - 5.6.7.8 HTTP/1.1 Mozilla/5.0+(X11;+U;+Linux+i686;+en-US;+rv:1.9.2.7)+Gecko/20100722+Firefox/3.6.7 - - example.com 200 0 0 27028 214 1687
+2012-04-01 00:00:13 W3SVC834221556 PXQD1 1.2.3.4 GET /foo/bar topCat1=divinity&submit=Search 80 theuser 5.6.7.8 HTTP/1.1 Mozilla/5.0+(X11;+U;+Linux+i686;+en-US;+rv:1.9.2.7)+Gecko/20100722+Firefox/3.6.7 - - example.com 200 654 456 27028 214 1687
diff --git a/misc/log-analytics/tests/tests.py b/misc/log-analytics/tests/tests.py
index a550e3a388..81b27ad36f 100644
--- a/misc/log-analytics/tests/tests.py
+++ b/misc/log-analytics/tests/tests.py
@@ -95,7 +95,13 @@ class Options(object):
     enable_http_errors = False
     download_extensions = 'doc,pdf'
     custom_w3c_fields = {}
+    dump_log_regex = False
     w3c_time_taken_in_millisecs = False
+    w3c_fields = None
+    w3c_field_regexes = {}
+    regex_group_to_visit_cvars_map = {}
+    regex_group_to_page_cvars_map = {}
+    regex_groups_to_ignore = None
 
 class Config(object):
     """Mock configuration."""
@@ -460,8 +466,6 @@ def test_amazon_cloudfront_web_parsing():
 
     hits = [hit.__dict__ for hit in Recorder.recorders]
 
-    import_logs.logging.debug(hits)
-
     assert hits[0]['status'] == u'200'
     assert hits[0]['userid'] == None
     assert hits[0]['is_error'] == False
@@ -502,8 +506,6 @@ def test_amazon_cloudfront_rtmp_parsing():
 
     hits = [hit.__dict__ for hit in Recorder.recorders]
 
-    import_logs.logging.debug(hits)
-
     assert hits[0]['is_download'] == False
     assert hits[0]['ip'] == u'192.0.2.147'
     assert hits[0]['is_redirect'] == False
@@ -552,3 +554,95 @@ def test_amazon_cloudfront_rtmp_parsing():
     assert hits[1]['full_path'] == u'/shqshne4jdp4b6.cloudfront.net/cfx/st\u200b'
 
     assert len(hits) == 2
+
+def test_ignore_groups_option_removes_groups():
+    """Test that the --ignore-groups option removes groups so they do not appear in hits."""
+
+    file_ = 'logs/iis.log'
+
+    # have to override previous globals override for this test
+    import_logs.config.options.custom_w3c_fields = {}
+    Recorder.recorders = []
+    import_logs.parser = import_logs.Parser()
+    import_logs.config.format = None
+    import_logs.config.options.enable_http_redirects = True
+    import_logs.config.options.enable_http_errors = True
+    import_logs.config.options.replay_tracking = False
+    import_logs.config.options.w3c_time_taken_in_millisecs = True
+    import_logs.config.options.regex_groups_to_ignore = set(['userid','generation_time_milli'])
+    import_logs.parser.parse(file_)
+
+    hits = [hit.__dict__ for hit in Recorder.recorders]
+
+    assert hits[0]['userid'] == None
+    assert hits[0]['generation_time_milli'] == 0
+
+def test_regex_group_to_custom_var_options():
+    """Test that the --regex-group-to-visit-cvar and --regex-group-to-page-cvar track regex groups to custom vars."""
+
+    file_ = 'logs/iis.log'
+
+    # have to override previous globals override for this test
+    import_logs.config.options.custom_w3c_fields = {}
+    Recorder.recorders = []
+    import_logs.parser = import_logs.Parser()
+    import_logs.config.format = None
+    import_logs.config.options.enable_http_redirects = True
+    import_logs.config.options.enable_http_errors = True
+    import_logs.config.options.replay_tracking = False
+    import_logs.config.options.w3c_time_taken_in_millisecs = True
+    import_logs.config.options.regex_groups_to_ignore = set()
+    import_logs.config.options.regex_group_to_visit_cvars_map = {
+        'userid': "User Name",
+        'date': "The Date"
+    }
+    import_logs.config.options.regex_group_to_page_cvars_map = {
+        'generation_time_milli': 'Geneartion Time',
+        'referrer': 'The Referrer'
+    }
+    import_logs.parser.parse(file_)
+
+    hits = [hit.__dict__ for hit in Recorder.recorders]
+
+    assert hits[0]['args']['_cvar'] == {1: ['The Date', '2012-04-01 00:00:13'], 2: ['User Name', 'theuser']} # check visit custom vars
+    assert hits[0]['args']['cvar'] == {1: ['Geneartion Time', '1687']} # check page custom vars
+
+    assert hits[0]['userid'] == 'theuser'
+    assert hits[0]['date'] == datetime.datetime(2012, 4, 1, 0, 0, 13)
+    assert hits[0]['generation_time_milli'] == 1687
+    assert hits[0]['referrer'] == ''
+
+def test_w3c_custom_field_regex_option():
+    """Test that --w3c-field-regex can be used to match custom W3C log fields."""
+
+    file_ = 'logs/iis.log'
+
+    # have to override previous globals override for this test
+    import_logs.config.options.custom_w3c_fields = {}
+    Recorder.recorders = []
+    import_logs.parser = import_logs.Parser()
+    import_logs.config.format = None
+    import_logs.config.options.enable_http_redirects = True
+    import_logs.config.options.enable_http_errors = True
+    import_logs.config.options.replay_tracking = False
+    import_logs.config.options.w3c_time_taken_in_millisecs = True
+    import_logs.config.options.w3c_field_regexes = {
+        'sc-substatus': '(?P<substatus>\S+)',
+        'sc-win32-status': '(?P<win32_status>\S+)'
+    }
+
+    format = import_logs.W3cExtendedFormat()
+
+    file_handle = open(file_)
+    format.check_format(file_handle)
+    match = None
+    while not match:
+        line = file_handle.readline()
+        if not line:
+            break
+        match = format.match(line)
+    file_handle.close()
+
+    assert match is not None
+    assert format.get('substatus') == '654'
+    assert format.get('win32_status') == '456'
-- 
GitLab